Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* memcontrol.c - Memory Controller
3 *
4 * Copyright IBM Corporation, 2007
5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6 *
7 * Copyright 2007 OpenVZ SWsoft Inc
8 * Author: Pavel Emelianov <xemul@openvz.org>
9 *
10 * Memory thresholds
11 * Copyright (C) 2009 Nokia Corporation
12 * Author: Kirill A. Shutemov
13 *
14 * Kernel Memory Controller
15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
16 * Authors: Glauber Costa and Suleiman Souhlal
17 *
18 * Native page reclaim
19 * Charge lifetime sanitation
20 * Lockless page tracking & accounting
21 * Unified hierarchy configuration model
22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
23 *
24 * Per memcg lru locking
25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
26 */
27
28#include <linux/page_counter.h>
29#include <linux/memcontrol.h>
30#include <linux/cgroup.h>
31#include <linux/pagewalk.h>
32#include <linux/sched/mm.h>
33#include <linux/shmem_fs.h>
34#include <linux/hugetlb.h>
35#include <linux/pagemap.h>
36#include <linux/vm_event_item.h>
37#include <linux/smp.h>
38#include <linux/page-flags.h>
39#include <linux/backing-dev.h>
40#include <linux/bit_spinlock.h>
41#include <linux/rcupdate.h>
42#include <linux/limits.h>
43#include <linux/export.h>
44#include <linux/mutex.h>
45#include <linux/rbtree.h>
46#include <linux/slab.h>
47#include <linux/swap.h>
48#include <linux/swapops.h>
49#include <linux/spinlock.h>
50#include <linux/eventfd.h>
51#include <linux/poll.h>
52#include <linux/sort.h>
53#include <linux/fs.h>
54#include <linux/seq_file.h>
55#include <linux/vmpressure.h>
56#include <linux/memremap.h>
57#include <linux/mm_inline.h>
58#include <linux/swap_cgroup.h>
59#include <linux/cpu.h>
60#include <linux/oom.h>
61#include <linux/lockdep.h>
62#include <linux/file.h>
63#include <linux/resume_user_mode.h>
64#include <linux/psi.h>
65#include <linux/seq_buf.h>
66#include <linux/sched/isolation.h>
67#include "internal.h"
68#include <net/sock.h>
69#include <net/ip.h>
70#include "slab.h"
71#include "swap.h"
72
73#include <linux/uaccess.h>
74
75#include <trace/events/vmscan.h>
76
77struct cgroup_subsys memory_cgrp_subsys __read_mostly;
78EXPORT_SYMBOL(memory_cgrp_subsys);
79
80struct mem_cgroup *root_mem_cgroup __read_mostly;
81
82/* Active memory cgroup to use from an interrupt context */
83DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
84EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
85
86/* Socket memory accounting disabled? */
87static bool cgroup_memory_nosocket __ro_after_init;
88
89/* Kernel memory accounting disabled? */
90static bool cgroup_memory_nokmem __ro_after_init;
91
92/* BPF memory accounting disabled? */
93static bool cgroup_memory_nobpf __ro_after_init;
94
95#ifdef CONFIG_CGROUP_WRITEBACK
96static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
97#endif
98
99/* Whether legacy memory+swap accounting is active */
100static bool do_memsw_account(void)
101{
102 return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
103}
104
105#define THRESHOLDS_EVENTS_TARGET 128
106#define SOFTLIMIT_EVENTS_TARGET 1024
107
108/*
109 * Cgroups above their limits are maintained in an RB-tree, independent of
110 * their hierarchy representation.
111 */
112
113struct mem_cgroup_tree_per_node {
114 struct rb_root rb_root;
115 struct rb_node *rb_rightmost;
116 spinlock_t lock;
117};
118
119struct mem_cgroup_tree {
120 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
121};
122
123static struct mem_cgroup_tree soft_limit_tree __read_mostly;
124
125/* for OOM */
126struct mem_cgroup_eventfd_list {
127 struct list_head list;
128 struct eventfd_ctx *eventfd;
129};
130
131/*
132 * cgroup_event represents events which userspace wants to receive.
133 */
134struct mem_cgroup_event {
135 /*
136 * memcg which the event belongs to.
137 */
138 struct mem_cgroup *memcg;
139 /*
140 * eventfd to signal userspace about the event.
141 */
142 struct eventfd_ctx *eventfd;
143 /*
144 * Each of these is stored in a list by the cgroup.
145 */
146 struct list_head list;
147 /*
148 * register_event() callback will be used to add a new userspace
149 * waiter for changes related to this event. Use eventfd_signal()
150 * on the eventfd to send a notification to userspace.
151 */
152 int (*register_event)(struct mem_cgroup *memcg,
153 struct eventfd_ctx *eventfd, const char *args);
154 /*
155 * unregister_event() callback will be called when userspace closes
156 * the eventfd or when the cgroup is removed. This callback must be
157 * set if you want to provide notification functionality.
158 */
159 void (*unregister_event)(struct mem_cgroup *memcg,
160 struct eventfd_ctx *eventfd);
161 /*
162 * All fields below are needed to unregister the event when
163 * userspace closes the eventfd.
164 */
165 poll_table pt;
166 wait_queue_head_t *wqh;
167 wait_queue_entry_t wait;
168 struct work_struct remove;
169};
170
171static void mem_cgroup_threshold(struct mem_cgroup *memcg);
172static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
173
174/* Stuff for moving charges at task migration. */
175/*
176 * Types of charges to be moved.
177 */
178#define MOVE_ANON 0x1U
179#define MOVE_FILE 0x2U
180#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
181
182/* "mc" and its members are protected by cgroup_mutex */
183static struct move_charge_struct {
184 spinlock_t lock; /* for from, to */
185 struct mm_struct *mm;
186 struct mem_cgroup *from;
187 struct mem_cgroup *to;
188 unsigned long flags;
189 unsigned long precharge;
190 unsigned long moved_charge;
191 unsigned long moved_swap;
192 struct task_struct *moving_task; /* a task moving charges */
193 wait_queue_head_t waitq; /* a waitq for other context */
194} mc = {
195 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
196 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
197};
198
199/*
200 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
201 * limit reclaim to prevent infinite loops, if they ever occur.
202 */
203#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
204#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
205
206/* for encoding cft->private value on file */
207enum res_type {
208 _MEM,
209 _MEMSWAP,
210 _KMEM,
211 _TCP,
212};
213
214#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
215#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff)
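/*
 * Illustrative sketch (not from the original source): cft->private packs a
 * res_type in the upper 16 bits and a counter attribute in the lower 16 bits.
 * Assuming an attribute constant such as RES_LIMIT is defined elsewhere in
 * this file:
 *
 *	priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *	MEMFILE_TYPE(priv);	// == _MEMSWAP, the res_type
 *	MEMFILE_ATTR(priv);	// == RES_LIMIT, the attribute
 */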
217
218/*
219 * Iteration constructs for visiting all cgroups (under a tree). If
220 * loops are exited prematurely (break), mem_cgroup_iter_break() must
221 * be used for reference counting.
222 */
223#define for_each_mem_cgroup_tree(iter, root) \
224 for (iter = mem_cgroup_iter(root, NULL, NULL); \
225 iter != NULL; \
226 iter = mem_cgroup_iter(root, iter, NULL))
227
228#define for_each_mem_cgroup(iter) \
229 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
230 iter != NULL; \
231 iter = mem_cgroup_iter(NULL, iter, NULL))
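/*
 * Usage sketch (some_condition() is a hypothetical predicate): when leaving
 * the loop early, mem_cgroup_iter_break() must be used to drop the reference
 * held on the last visited memcg:
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */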
232
233static inline bool task_is_dying(void)
234{
235 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
236 (current->flags & PF_EXITING);
237}
238
239/* Some nice accessors for the vmpressure. */
240struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
241{
242 if (!memcg)
243 memcg = root_mem_cgroup;
244 return &memcg->vmpressure;
245}
246
247struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
248{
249 return container_of(vmpr, struct mem_cgroup, vmpressure);
250}
251
252#ifdef CONFIG_MEMCG_KMEM
253static DEFINE_SPINLOCK(objcg_lock);
254
255bool mem_cgroup_kmem_disabled(void)
256{
257 return cgroup_memory_nokmem;
258}
259
260static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
261 unsigned int nr_pages);
262
263static void obj_cgroup_release(struct percpu_ref *ref)
264{
265 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
266 unsigned int nr_bytes;
267 unsigned int nr_pages;
268 unsigned long flags;
269
270 /*
271 * At this point all allocated objects are freed, and
272 * objcg->nr_charged_bytes can't have an arbitrary byte value.
273 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
274 *
275 * The following sequence can lead to it:
276 * 1) CPU0: objcg == stock->cached_objcg
277 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
278 * PAGE_SIZE bytes are charged
279 * 3) CPU1: a process from another memcg is allocating something,
280 * the stock is flushed,
281 * objcg->nr_charged_bytes = PAGE_SIZE - 92
282 * 4) CPU0: we release this object,
283 * 92 bytes are added to stock->nr_bytes
284 * 5) CPU0: the stock is flushed,
285 * 92 bytes are added to objcg->nr_charged_bytes
286 *
287 * As a result, nr_charged_bytes == PAGE_SIZE.
288 * This page will be uncharged in obj_cgroup_release().
289 */
290 nr_bytes = atomic_read(&objcg->nr_charged_bytes);
291 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
292 nr_pages = nr_bytes >> PAGE_SHIFT;
293
294 if (nr_pages)
295 obj_cgroup_uncharge_pages(objcg, nr_pages);
296
297 spin_lock_irqsave(&objcg_lock, flags);
298 list_del(&objcg->list);
299 spin_unlock_irqrestore(&objcg_lock, flags);
300
301 percpu_ref_exit(ref);
302 kfree_rcu(objcg, rcu);
303}
304
305static struct obj_cgroup *obj_cgroup_alloc(void)
306{
307 struct obj_cgroup *objcg;
308 int ret;
309
310 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
311 if (!objcg)
312 return NULL;
313
314 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
315 GFP_KERNEL);
316 if (ret) {
317 kfree(objcg);
318 return NULL;
319 }
320 INIT_LIST_HEAD(&objcg->list);
321 return objcg;
322}
323
324static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
325 struct mem_cgroup *parent)
326{
327 struct obj_cgroup *objcg, *iter;
328
329 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
330
331 spin_lock_irq(&objcg_lock);
332
333 /* 1) Ready to reparent active objcg. */
334 list_add(&objcg->list, &memcg->objcg_list);
335 /* 2) Reparent active objcg and already reparented objcgs to parent. */
336 list_for_each_entry(iter, &memcg->objcg_list, list)
337 WRITE_ONCE(iter->memcg, parent);
338 /* 3) Move already reparented objcgs to the parent's list */
339 list_splice(&memcg->objcg_list, &parent->objcg_list);
340
341 spin_unlock_irq(&objcg_lock);
342
343 percpu_ref_kill(&objcg->refcnt);
344}
345
346/*
347 * A lot of the calls to the cache allocation functions are expected to be
348 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
349 * conditional to this static branch, we'll have to allow modules that do
350 * kmem_cache_alloc and the like to see this symbol as well.
351 */
352DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
353EXPORT_SYMBOL(memcg_kmem_online_key);
354
355DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
356EXPORT_SYMBOL(memcg_bpf_enabled_key);
357#endif
358
359/**
360 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
361 * @folio: folio of interest
362 *
363 * If memcg is bound to the default hierarchy, css of the memcg associated
364 * with @folio is returned. The returned css remains associated with @folio
365 * until it is released.
366 *
367 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
368 * is returned.
369 */
370struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
371{
372 struct mem_cgroup *memcg = folio_memcg(folio);
373
374 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
375 memcg = root_mem_cgroup;
376
377 return &memcg->css;
378}
379
380/**
381 * page_cgroup_ino - return inode number of the memcg a page is charged to
382 * @page: the page
383 *
384 * Look up the closest online ancestor of the memory cgroup @page is charged to
385 * and return its inode number or 0 if @page is not charged to any cgroup. It
386 * is safe to call this function without holding a reference to @page.
387 *
388 * Note, this function is inherently racy, because there is nothing to prevent
389 * the cgroup inode from getting torn down and potentially reallocated a moment
390 * after page_cgroup_ino() returns, so it should only be used by callers that
391 * do not care (such as procfs interfaces).
392 */
393ino_t page_cgroup_ino(struct page *page)
394{
395 struct mem_cgroup *memcg;
396 unsigned long ino = 0;
397
398 rcu_read_lock();
399 /* page_folio() is racy here, but the entire function is racy anyway */
400 memcg = folio_memcg_check(page_folio(page));
401
402 while (memcg && !(memcg->css.flags & CSS_ONLINE))
403 memcg = parent_mem_cgroup(memcg);
404 if (memcg)
405 ino = cgroup_ino(memcg->css.cgroup);
406 rcu_read_unlock();
407 return ino;
408}
409
410static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
411 struct mem_cgroup_tree_per_node *mctz,
412 unsigned long new_usage_in_excess)
413{
414 struct rb_node **p = &mctz->rb_root.rb_node;
415 struct rb_node *parent = NULL;
416 struct mem_cgroup_per_node *mz_node;
417 bool rightmost = true;
418
419 if (mz->on_tree)
420 return;
421
422 mz->usage_in_excess = new_usage_in_excess;
423 if (!mz->usage_in_excess)
424 return;
425 while (*p) {
426 parent = *p;
427 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
428 tree_node);
429 if (mz->usage_in_excess < mz_node->usage_in_excess) {
430 p = &(*p)->rb_left;
431 rightmost = false;
432 } else {
433 p = &(*p)->rb_right;
434 }
435 }
436
437 if (rightmost)
438 mctz->rb_rightmost = &mz->tree_node;
439
440 rb_link_node(&mz->tree_node, parent, p);
441 rb_insert_color(&mz->tree_node, &mctz->rb_root);
442 mz->on_tree = true;
443}
444
445static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
446 struct mem_cgroup_tree_per_node *mctz)
447{
448 if (!mz->on_tree)
449 return;
450
451 if (&mz->tree_node == mctz->rb_rightmost)
452 mctz->rb_rightmost = rb_prev(&mz->tree_node);
453
454 rb_erase(&mz->tree_node, &mctz->rb_root);
455 mz->on_tree = false;
456}
457
458static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
459 struct mem_cgroup_tree_per_node *mctz)
460{
461 unsigned long flags;
462
463 spin_lock_irqsave(&mctz->lock, flags);
464 __mem_cgroup_remove_exceeded(mz, mctz);
465 spin_unlock_irqrestore(&mctz->lock, flags);
466}
467
468static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
469{
470 unsigned long nr_pages = page_counter_read(&memcg->memory);
471 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
472 unsigned long excess = 0;
473
474 if (nr_pages > soft_limit)
475 excess = nr_pages - soft_limit;
476
477 return excess;
478}
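/*
 * For example, with usage at 1200 pages and a soft limit equivalent to 1000
 * pages, the excess is 200 pages. At or below the soft limit the excess is 0,
 * and mem_cgroup_update_tree() will not (re)insert the memcg into the
 * soft-limit tree.
 */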
479
480static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
481{
482 unsigned long excess;
483 struct mem_cgroup_per_node *mz;
484 struct mem_cgroup_tree_per_node *mctz;
485
486 if (lru_gen_enabled()) {
487 if (soft_limit_excess(memcg))
488 lru_gen_soft_reclaim(memcg, nid);
489 return;
490 }
491
492 mctz = soft_limit_tree.rb_tree_per_node[nid];
493 if (!mctz)
494 return;
495 /*
496 * Necessary to update all ancestors when the hierarchy is used,
497 * because their event counters are not touched.
498 */
499 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
500 mz = memcg->nodeinfo[nid];
501 excess = soft_limit_excess(memcg);
502 /*
503 * We have to update the tree if mz is on RB-tree or
504 * mem is over its softlimit.
505 */
506 if (excess || mz->on_tree) {
507 unsigned long flags;
508
509 spin_lock_irqsave(&mctz->lock, flags);
510 /* if on-tree, remove it */
511 if (mz->on_tree)
512 __mem_cgroup_remove_exceeded(mz, mctz);
513 /*
514 * Insert again. mz->usage_in_excess will be updated.
515 * If excess is 0, no tree ops.
516 */
517 __mem_cgroup_insert_exceeded(mz, mctz, excess);
518 spin_unlock_irqrestore(&mctz->lock, flags);
519 }
520 }
521}
522
523static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
524{
525 struct mem_cgroup_tree_per_node *mctz;
526 struct mem_cgroup_per_node *mz;
527 int nid;
528
529 for_each_node(nid) {
530 mz = memcg->nodeinfo[nid];
531 mctz = soft_limit_tree.rb_tree_per_node[nid];
532 if (mctz)
533 mem_cgroup_remove_exceeded(mz, mctz);
534 }
535}
536
537static struct mem_cgroup_per_node *
538__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
539{
540 struct mem_cgroup_per_node *mz;
541
542retry:
543 mz = NULL;
544 if (!mctz->rb_rightmost)
545 goto done; /* Nothing to reclaim from */
546
547 mz = rb_entry(mctz->rb_rightmost,
548 struct mem_cgroup_per_node, tree_node);
549 /*
550 * Remove the node now, but someone else can add it back;
551 * we will add it back at the end of reclaim to its correct
552 * position in the tree.
553 */
554 __mem_cgroup_remove_exceeded(mz, mctz);
555 if (!soft_limit_excess(mz->memcg) ||
556 !css_tryget(&mz->memcg->css))
557 goto retry;
558done:
559 return mz;
560}
561
562static struct mem_cgroup_per_node *
563mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
564{
565 struct mem_cgroup_per_node *mz;
566
567 spin_lock_irq(&mctz->lock);
568 mz = __mem_cgroup_largest_soft_limit_node(mctz);
569 spin_unlock_irq(&mctz->lock);
570 return mz;
571}
572
573/*
574 * memcg and lruvec stats flushing
575 *
576 * Many codepaths leading to stats update or read are performance sensitive and
577 * adding stats flushing in such codepaths is not desirable. So, to optimize the
578 * flushing the kernel does:
579 *
580 * 1) Periodically and asynchronously flush the stats every 2 seconds so that
581 * the rstat update tree does not grow unbounded.
582 *
583 * 2) Flush the stats synchronously on the reader side only when there are more than
584 * (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets the stats
585 * be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates, but
586 * only for 2 seconds due to (1).
587 */
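/*
 * Rough arithmetic sketch (assuming MEMCG_CHARGE_BATCH is 64, as defined in
 * memcontrol.h): on an 8-CPU machine a reader-side synchronous flush only
 * happens after roughly 64 * 8 = 512 stat updates have accumulated since the
 * last flush; anything less is left to the 2-second periodic worker.
 */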
588static void flush_memcg_stats_dwork(struct work_struct *w);
589static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
590static DEFINE_PER_CPU(unsigned int, stats_updates);
591static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
592static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
593static u64 flush_next_time;
594
595#define FLUSH_TIME (2UL*HZ)
596
597/*
598 * Accessors to ensure that preemption is disabled on PREEMPT_RT, because the
599 * code cannot rely on an acquired spinlock_t to disable it. These functions are
600 * never used in hardirq context on PREEMPT_RT, and therefore disabling preemption
601 * is sufficient.
602 */
603static void memcg_stats_lock(void)
604{
605 preempt_disable_nested();
606 VM_WARN_ON_IRQS_ENABLED();
607}
608
609static void __memcg_stats_lock(void)
610{
611 preempt_disable_nested();
612}
613
614static void memcg_stats_unlock(void)
615{
616 preempt_enable_nested();
617}
618
619static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
620{
621 unsigned int x;
622
623 if (!val)
624 return;
625
626 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
627
628 x = __this_cpu_add_return(stats_updates, abs(val));
629 if (x > MEMCG_CHARGE_BATCH) {
630 /*
631 * If stats_flush_threshold exceeds the threshold
632 * (>num_online_cpus()), a stats flush will be triggered
633 * in mem_cgroup_flush_stats(). Increasing this variable further
634 * is redundant and simply adds overhead to the atomic update.
635 */
636 if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
637 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
638 __this_cpu_write(stats_updates, 0);
639 }
640}
641
642static void do_flush_stats(void)
643{
644 /*
645 * We always flush the entire tree, so concurrent flushers can just
646 * skip. This avoids a thundering herd problem on the rstat global lock
647 * from memcg flushers (e.g. reclaim, refault, etc).
648 */
649 if (atomic_read(&stats_flush_ongoing) ||
650 atomic_xchg(&stats_flush_ongoing, 1))
651 return;
652
653 WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
654
655 cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
656
657 atomic_set(&stats_flush_threshold, 0);
658 atomic_set(&stats_flush_ongoing, 0);
659}
660
661void mem_cgroup_flush_stats(void)
662{
663 if (atomic_read(&stats_flush_threshold) > num_online_cpus())
664 do_flush_stats();
665}
666
667void mem_cgroup_flush_stats_ratelimited(void)
668{
669 if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
670 mem_cgroup_flush_stats();
671}
672
673static void flush_memcg_stats_dwork(struct work_struct *w)
674{
675 /*
676 * Always flush here so that flushing in latency-sensitive paths is
677 * as cheap as possible.
678 */
679 do_flush_stats();
680 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
681}
682
683/* Subset of vm_event_item to report for memcg event stats */
684static const unsigned int memcg_vm_event_stat[] = {
685 PGPGIN,
686 PGPGOUT,
687 PGSCAN_KSWAPD,
688 PGSCAN_DIRECT,
689 PGSCAN_KHUGEPAGED,
690 PGSTEAL_KSWAPD,
691 PGSTEAL_DIRECT,
692 PGSTEAL_KHUGEPAGED,
693 PGFAULT,
694 PGMAJFAULT,
695 PGREFILL,
696 PGACTIVATE,
697 PGDEACTIVATE,
698 PGLAZYFREE,
699 PGLAZYFREED,
700#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
701 ZSWPIN,
702 ZSWPOUT,
703#endif
704#ifdef CONFIG_TRANSPARENT_HUGEPAGE
705 THP_FAULT_ALLOC,
706 THP_COLLAPSE_ALLOC,
707#endif
708};
709
710#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
711static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
712
713static void init_memcg_events(void)
714{
715 int i;
716
717 for (i = 0; i < NR_MEMCG_EVENTS; ++i)
718 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
719}
720
721static inline int memcg_events_index(enum vm_event_item idx)
722{
723 return mem_cgroup_events_index[idx] - 1;
724}
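/*
 * init_memcg_events() stores index + 1 in the table, so the zero entries left
 * for events not listed in memcg_vm_event_stat decode to -1 here, which
 * callers treat as "not tracked for memcg". For example,
 * memcg_events_index(PGPGIN) returns 0 (the first slot), while an untracked
 * item returns -1.
 */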
725
726struct memcg_vmstats_percpu {
727 /* Local (CPU and cgroup) page state & events */
728 long state[MEMCG_NR_STAT];
729 unsigned long events[NR_MEMCG_EVENTS];
730
731 /* Delta calculation for lockless upward propagation */
732 long state_prev[MEMCG_NR_STAT];
733 unsigned long events_prev[NR_MEMCG_EVENTS];
734
735 /* Cgroup1: threshold notifications & softlimit tree updates */
736 unsigned long nr_page_events;
737 unsigned long targets[MEM_CGROUP_NTARGETS];
738};
739
740struct memcg_vmstats {
741 /* Aggregated (CPU and subtree) page state & events */
742 long state[MEMCG_NR_STAT];
743 unsigned long events[NR_MEMCG_EVENTS];
744
745 /* Non-hierarchical (CPU aggregated) page state & events */
746 long state_local[MEMCG_NR_STAT];
747 unsigned long events_local[NR_MEMCG_EVENTS];
748
749 /* Pending child counts during tree propagation */
750 long state_pending[MEMCG_NR_STAT];
751 unsigned long events_pending[NR_MEMCG_EVENTS];
752};
753
754unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
755{
756 long x = READ_ONCE(memcg->vmstats->state[idx]);
757#ifdef CONFIG_SMP
758 if (x < 0)
759 x = 0;
760#endif
761 return x;
762}
763
764/**
765 * __mod_memcg_state - update cgroup memory statistics
766 * @memcg: the memory cgroup
767 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
768 * @val: delta to add to the counter, can be negative
769 */
770void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
771{
772 if (mem_cgroup_disabled())
773 return;
774
775 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
776 memcg_rstat_updated(memcg, val);
777}
778
779/* idx can be of type enum memcg_stat_item or node_stat_item. */
780static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
781{
782 long x = READ_ONCE(memcg->vmstats->state_local[idx]);
783
784#ifdef CONFIG_SMP
785 if (x < 0)
786 x = 0;
787#endif
788 return x;
789}
790
791void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
792 int val)
793{
794 struct mem_cgroup_per_node *pn;
795 struct mem_cgroup *memcg;
796
797 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
798 memcg = pn->memcg;
799
800 /*
801 * Callers from rmap rely on disabled preemption because they never
802 * update their counters from in-interrupt context. For these
803 * counters we check that the update is never performed from an
804 * interrupt context, while other callers need to have interrupts disabled.
805 */
806 __memcg_stats_lock();
807 if (IS_ENABLED(CONFIG_DEBUG_VM)) {
808 switch (idx) {
809 case NR_ANON_MAPPED:
810 case NR_FILE_MAPPED:
811 case NR_ANON_THPS:
812 case NR_SHMEM_PMDMAPPED:
813 case NR_FILE_PMDMAPPED:
814 WARN_ON_ONCE(!in_task());
815 break;
816 default:
817 VM_WARN_ON_IRQS_ENABLED();
818 }
819 }
820
821 /* Update memcg */
822 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
823
824 /* Update lruvec */
825 __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
826
827 memcg_rstat_updated(memcg, val);
828 memcg_stats_unlock();
829}
830
831/**
832 * __mod_lruvec_state - update lruvec memory statistics
833 * @lruvec: the lruvec
834 * @idx: the stat item
835 * @val: delta to add to the counter, can be negative
836 *
837 * The lruvec is the intersection of the NUMA node and a cgroup. This
838 * function updates all three counters that are affected by a
839 * change of state at this level: per-node, per-cgroup, per-lruvec.
840 */
841void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
842 int val)
843{
844 /* Update node */
845 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
846
847 /* Update memcg and lruvec */
848 if (!mem_cgroup_disabled())
849 __mod_memcg_lruvec_state(lruvec, idx, val);
850}
851
852void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
853 int val)
854{
855 struct page *head = compound_head(page); /* rmap on tail pages */
856 struct mem_cgroup *memcg;
857 pg_data_t *pgdat = page_pgdat(page);
858 struct lruvec *lruvec;
859
860 rcu_read_lock();
861 memcg = page_memcg(head);
862 /* Untracked pages have no memcg, no lruvec. Update only the node */
863 if (!memcg) {
864 rcu_read_unlock();
865 __mod_node_page_state(pgdat, idx, val);
866 return;
867 }
868
869 lruvec = mem_cgroup_lruvec(memcg, pgdat);
870 __mod_lruvec_state(lruvec, idx, val);
871 rcu_read_unlock();
872}
873EXPORT_SYMBOL(__mod_lruvec_page_state);
874
875void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
876{
877 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
878 struct mem_cgroup *memcg;
879 struct lruvec *lruvec;
880
881 rcu_read_lock();
882 memcg = mem_cgroup_from_slab_obj(p);
883
884 /*
885 * Untracked pages have no memcg, no lruvec. Update only the
886 * node. If we reparent the slab objects to the root memcg,
887 * when we free the slab object, we need to update the per-memcg
888 * vmstats to keep it correct for the root memcg.
889 */
890 if (!memcg) {
891 __mod_node_page_state(pgdat, idx, val);
892 } else {
893 lruvec = mem_cgroup_lruvec(memcg, pgdat);
894 __mod_lruvec_state(lruvec, idx, val);
895 }
896 rcu_read_unlock();
897}
898
899/**
900 * __count_memcg_events - account VM events in a cgroup
901 * @memcg: the memory cgroup
902 * @idx: the event item
903 * @count: the number of events that occurred
904 */
905void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
906 unsigned long count)
907{
908 int index = memcg_events_index(idx);
909
910 if (mem_cgroup_disabled() || index < 0)
911 return;
912
913 memcg_stats_lock();
914 __this_cpu_add(memcg->vmstats_percpu->events[index], count);
915 memcg_rstat_updated(memcg, count);
916 memcg_stats_unlock();
917}
918
919static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
920{
921 int index = memcg_events_index(event);
922
923 if (index < 0)
924 return 0;
925 return READ_ONCE(memcg->vmstats->events[index]);
926}
927
928static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
929{
930 int index = memcg_events_index(event);
931
932 if (index < 0)
933 return 0;
934
935 return READ_ONCE(memcg->vmstats->events_local[index]);
936}
937
938static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
939 int nr_pages)
940{
941 /* pagein of a large page is a single event, so ignore the page size */
942 if (nr_pages > 0)
943 __count_memcg_events(memcg, PGPGIN, 1);
944 else {
945 __count_memcg_events(memcg, PGPGOUT, 1);
946 nr_pages = -nr_pages; /* for event */
947 }
948
949 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
950}
951
952static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
953 enum mem_cgroup_events_target target)
954{
955 unsigned long val, next;
956
957 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
958 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
959 /* from time_after() in jiffies.h */
960 if ((long)(next - val) < 0) {
961 switch (target) {
962 case MEM_CGROUP_TARGET_THRESH:
963 next = val + THRESHOLDS_EVENTS_TARGET;
964 break;
965 case MEM_CGROUP_TARGET_SOFTLIMIT:
966 next = val + SOFTLIMIT_EVENTS_TARGET;
967 break;
968 default:
969 break;
970 }
971 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
972 return true;
973 }
974 return false;
975}
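/*
 * The (long)(next - val) < 0 test is the same wrap-safe comparison used by
 * time_after(). With THRESHOLDS_EVENTS_TARGET == 128, a target armed at
 * next = val + 128 fires once the per-cpu page event count has advanced by at
 * least 128, even if the counter wraps around in between.
 */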
976
977/*
978 * Check events in order.
979 *
980 */
981static void memcg_check_events(struct mem_cgroup *memcg, int nid)
982{
983 if (IS_ENABLED(CONFIG_PREEMPT_RT))
984 return;
985
986 /* threshold event is triggered in finer grain than soft limit */
987 if (unlikely(mem_cgroup_event_ratelimit(memcg,
988 MEM_CGROUP_TARGET_THRESH))) {
989 bool do_softlimit;
990
991 do_softlimit = mem_cgroup_event_ratelimit(memcg,
992 MEM_CGROUP_TARGET_SOFTLIMIT);
993 mem_cgroup_threshold(memcg);
994 if (unlikely(do_softlimit))
995 mem_cgroup_update_tree(memcg, nid);
996 }
997}
998
999struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1000{
1001 /*
1002 * mm_update_next_owner() may clear mm->owner to NULL
1003 * if it races with swapoff, page migration, etc.
1004 * So this can be called with p == NULL.
1005 */
1006 if (unlikely(!p))
1007 return NULL;
1008
1009 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1010}
1011EXPORT_SYMBOL(mem_cgroup_from_task);
1012
1013static __always_inline struct mem_cgroup *active_memcg(void)
1014{
1015 if (!in_task())
1016 return this_cpu_read(int_active_memcg);
1017 else
1018 return current->active_memcg;
1019}
1020
1021/**
1022 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
1023 * @mm: mm from which memcg should be extracted. It can be NULL.
1024 *
1025 * Obtain a reference on mm->memcg and return it if successful. If mm
1026 * is NULL, then the memcg is chosen as follows:
1027 * 1) The active memcg, if set.
1028 * 2) current->mm->memcg, if available
1029 * 3) root memcg
1030 * If mem_cgroup is disabled, NULL is returned.
1031 */
1032struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1033{
1034 struct mem_cgroup *memcg;
1035
1036 if (mem_cgroup_disabled())
1037 return NULL;
1038
1039 /*
1040 * Page cache insertions can happen without an
1041 * actual mm context, e.g. during disk probing
1042 * on boot, loopback IO, acct() writes etc.
1043 *
1044 * No need to css_get on root memcg as the reference
1045 * counting is disabled on the root level in the
1046 * cgroup core. See CSS_NO_REF.
1047 */
1048 if (unlikely(!mm)) {
1049 memcg = active_memcg();
1050 if (unlikely(memcg)) {
1051 /* remote memcg must hold a ref */
1052 css_get(&memcg->css);
1053 return memcg;
1054 }
1055 mm = current->mm;
1056 if (unlikely(!mm))
1057 return root_mem_cgroup;
1058 }
1059
1060 rcu_read_lock();
1061 do {
1062 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1063 if (unlikely(!memcg))
1064 memcg = root_mem_cgroup;
1065 } while (!css_tryget(&memcg->css));
1066 rcu_read_unlock();
1067 return memcg;
1068}
1069EXPORT_SYMBOL(get_mem_cgroup_from_mm);
1070
1071static __always_inline bool memcg_kmem_bypass(void)
1072{
1073 /* Allow remote memcg charging from any context. */
1074 if (unlikely(active_memcg()))
1075 return false;
1076
1077 /* Memcg to charge can't be determined. */
1078 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
1079 return true;
1080
1081 return false;
1082}
1083
1084/**
1085 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1086 * @root: hierarchy root
1087 * @prev: previously returned memcg, NULL on first invocation
1088 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1089 *
1090 * Returns references to children of the hierarchy below @root, or
1091 * @root itself, or %NULL after a full round-trip.
1092 *
1093 * Caller must pass the return value in @prev on subsequent
1094 * invocations for reference counting, or use mem_cgroup_iter_break()
1095 * to cancel a hierarchy walk before the round-trip is complete.
1096 *
1097 * Reclaimers can specify a node in @reclaim to divide up the memcgs
1098 * in the hierarchy among all concurrent reclaimers operating on the
1099 * same node.
1100 */
1101struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1102 struct mem_cgroup *prev,
1103 struct mem_cgroup_reclaim_cookie *reclaim)
1104{
1105 struct mem_cgroup_reclaim_iter *iter;
1106 struct cgroup_subsys_state *css = NULL;
1107 struct mem_cgroup *memcg = NULL;
1108 struct mem_cgroup *pos = NULL;
1109
1110 if (mem_cgroup_disabled())
1111 return NULL;
1112
1113 if (!root)
1114 root = root_mem_cgroup;
1115
1116 rcu_read_lock();
1117
1118 if (reclaim) {
1119 struct mem_cgroup_per_node *mz;
1120
1121 mz = root->nodeinfo[reclaim->pgdat->node_id];
1122 iter = &mz->iter;
1123
1124 /*
1125 * On start, join the current reclaim iteration cycle.
1126 * Exit when a concurrent walker completes it.
1127 */
1128 if (!prev)
1129 reclaim->generation = iter->generation;
1130 else if (reclaim->generation != iter->generation)
1131 goto out_unlock;
1132
1133 while (1) {
1134 pos = READ_ONCE(iter->position);
1135 if (!pos || css_tryget(&pos->css))
1136 break;
1137 /*
1138 * css reference reached zero, so iter->position will
1139 * be cleared by ->css_released. However, we should not
1140 * rely on this happening soon, because ->css_released
1141 * is called from a work queue, and by busy-waiting we
1142 * might block it. So we clear iter->position right
1143 * away.
1144 */
1145 (void)cmpxchg(&iter->position, pos, NULL);
1146 }
1147 } else if (prev) {
1148 pos = prev;
1149 }
1150
1151 if (pos)
1152 css = &pos->css;
1153
1154 for (;;) {
1155 css = css_next_descendant_pre(css, &root->css);
1156 if (!css) {
1157 /*
1158 * Reclaimers share the hierarchy walk, and a
1159 * new one might jump in right at the end of
1160 * the hierarchy - make sure they see at least
1161 * one group and restart from the beginning.
1162 */
1163 if (!prev)
1164 continue;
1165 break;
1166 }
1167
1168 /*
1169 * Verify the css and acquire a reference. The root
1170 * is provided by the caller, so we know it's alive
1171 * and kicking, and don't take an extra reference.
1172 */
1173 if (css == &root->css || css_tryget(css)) {
1174 memcg = mem_cgroup_from_css(css);
1175 break;
1176 }
1177 }
1178
1179 if (reclaim) {
1180 /*
1181 * The position could have already been updated by a competing
1182 * thread, so check that the value hasn't changed since we read
1183 * it to avoid reclaiming from the same cgroup twice.
1184 */
1185 (void)cmpxchg(&iter->position, pos, memcg);
1186
1187 if (pos)
1188 css_put(&pos->css);
1189
1190 if (!memcg)
1191 iter->generation++;
1192 }
1193
1194out_unlock:
1195 rcu_read_unlock();
1196 if (prev && prev != root)
1197 css_put(&prev->css);
1198
1199 return memcg;
1200}
1201
1202/**
1203 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1204 * @root: hierarchy root
1205 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1206 */
1207void mem_cgroup_iter_break(struct mem_cgroup *root,
1208 struct mem_cgroup *prev)
1209{
1210 if (!root)
1211 root = root_mem_cgroup;
1212 if (prev && prev != root)
1213 css_put(&prev->css);
1214}
1215
1216static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1217 struct mem_cgroup *dead_memcg)
1218{
1219 struct mem_cgroup_reclaim_iter *iter;
1220 struct mem_cgroup_per_node *mz;
1221 int nid;
1222
1223 for_each_node(nid) {
1224 mz = from->nodeinfo[nid];
1225 iter = &mz->iter;
1226 cmpxchg(&iter->position, dead_memcg, NULL);
1227 }
1228}
1229
1230static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1231{
1232 struct mem_cgroup *memcg = dead_memcg;
1233 struct mem_cgroup *last;
1234
1235 do {
1236 __invalidate_reclaim_iterators(memcg, dead_memcg);
1237 last = memcg;
1238 } while ((memcg = parent_mem_cgroup(memcg)));
1239
1240 /*
1241 * When cgroup1 non-hierarchy mode is used,
1242 * parent_mem_cgroup() does not walk all the way up to the
1243 * cgroup root (root_mem_cgroup). So we have to handle
1244 * dead_memcg from cgroup root separately.
1245 */
1246 if (!mem_cgroup_is_root(last))
1247 __invalidate_reclaim_iterators(root_mem_cgroup,
1248 dead_memcg);
1249}
1250
1251/**
1252 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1253 * @memcg: hierarchy root
1254 * @fn: function to call for each task
1255 * @arg: argument passed to @fn
1256 *
1257 * This function iterates over tasks attached to @memcg or to any of its
1258 * descendants and calls @fn for each task. If @fn returns a non-zero
1259 * value, the function breaks the iteration loop. Otherwise, it will iterate
1260 * over all tasks and return 0.
1261 *
1262 * This function must not be called for the root memory cgroup.
1263 */
1264void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1265 int (*fn)(struct task_struct *, void *), void *arg)
1266{
1267 struct mem_cgroup *iter;
1268 int ret = 0;
1269
1270 BUG_ON(mem_cgroup_is_root(memcg));
1271
1272 for_each_mem_cgroup_tree(iter, memcg) {
1273 struct css_task_iter it;
1274 struct task_struct *task;
1275
1276 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1277 while (!ret && (task = css_task_iter_next(&it)))
1278 ret = fn(task, arg);
1279 css_task_iter_end(&it);
1280 if (ret) {
1281 mem_cgroup_iter_break(memcg, iter);
1282 break;
1283 }
1284 }
1285}
1286
1287#ifdef CONFIG_DEBUG_VM
1288void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1289{
1290 struct mem_cgroup *memcg;
1291
1292 if (mem_cgroup_disabled())
1293 return;
1294
1295 memcg = folio_memcg(folio);
1296
1297 if (!memcg)
1298 VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
1299 else
1300 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1301}
1302#endif
1303
1304/**
1305 * folio_lruvec_lock - Lock the lruvec for a folio.
1306 * @folio: Pointer to the folio.
1307 *
1308 * These functions are safe to use under any of the following conditions:
1309 * - folio locked
1310 * - folio_test_lru false
1311 * - folio_memcg_lock()
1312 * - folio frozen (refcount of 0)
1313 *
1314 * Return: The lruvec this folio is on with its lock held.
1315 */
1316struct lruvec *folio_lruvec_lock(struct folio *folio)
1317{
1318 struct lruvec *lruvec = folio_lruvec(folio);
1319
1320 spin_lock(&lruvec->lru_lock);
1321 lruvec_memcg_debug(lruvec, folio);
1322
1323 return lruvec;
1324}
1325
1326/**
1327 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1328 * @folio: Pointer to the folio.
1329 *
1330 * These functions are safe to use under any of the following conditions:
1331 * - folio locked
1332 * - folio_test_lru false
1333 * - folio_memcg_lock()
1334 * - folio frozen (refcount of 0)
1335 *
1336 * Return: The lruvec this folio is on with its lock held and interrupts
1337 * disabled.
1338 */
1339struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1340{
1341 struct lruvec *lruvec = folio_lruvec(folio);
1342
1343 spin_lock_irq(&lruvec->lru_lock);
1344 lruvec_memcg_debug(lruvec, folio);
1345
1346 return lruvec;
1347}
1348
1349/**
1350 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1351 * @folio: Pointer to the folio.
1352 * @flags: Pointer to irqsave flags.
1353 *
1354 * These functions are safe to use under any of the following conditions:
1355 * - folio locked
1356 * - folio_test_lru false
1357 * - folio_memcg_lock()
1358 * - folio frozen (refcount of 0)
1359 *
1360 * Return: The lruvec this folio is on with its lock held and interrupts
1361 * disabled.
1362 */
1363struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1364 unsigned long *flags)
1365{
1366 struct lruvec *lruvec = folio_lruvec(folio);
1367
1368 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1369 lruvec_memcg_debug(lruvec, folio);
1370
1371 return lruvec;
1372}
1373
1374/**
1375 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1376 * @lruvec: mem_cgroup per zone lru vector
1377 * @lru: index of lru list the page is sitting on
1378 * @zid: zone id of the accounted pages
1379 * @nr_pages: positive when adding or negative when removing
1380 *
1381 * This function must be called under lru_lock, just before a page is added
1382 * to or just after a page is removed from an lru list.
1383 */
1384void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1385 int zid, int nr_pages)
1386{
1387 struct mem_cgroup_per_node *mz;
1388 unsigned long *lru_size;
1389 long size;
1390
1391 if (mem_cgroup_disabled())
1392 return;
1393
1394 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1395 lru_size = &mz->lru_zone_size[zid][lru];
1396
1397 if (nr_pages < 0)
1398 *lru_size += nr_pages;
1399
1400 size = *lru_size;
1401 if (WARN_ONCE(size < 0,
1402 "%s(%p, %d, %d): lru_size %ld\n",
1403 __func__, lruvec, lru, nr_pages, size)) {
1404 VM_BUG_ON(1);
1405 *lru_size = 0;
1406 }
1407
1408 if (nr_pages > 0)
1409 *lru_size += nr_pages;
1410}
1411
1412/**
1413 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1414 * @memcg: the memory cgroup
1415 *
1416 * Returns the maximum amount of memory @memcg can be charged with, in
1417 * pages.
1418 */
1419static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1420{
1421 unsigned long margin = 0;
1422 unsigned long count;
1423 unsigned long limit;
1424
1425 count = page_counter_read(&memcg->memory);
1426 limit = READ_ONCE(memcg->memory.max);
1427 if (count < limit)
1428 margin = limit - count;
1429
1430 if (do_memsw_account()) {
1431 count = page_counter_read(&memcg->memsw);
1432 limit = READ_ONCE(memcg->memsw.max);
1433 if (count < limit)
1434 margin = min(margin, limit - count);
1435 else
1436 margin = 0;
1437 }
1438
1439 return margin;
1440}
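/*
 * Worked example (illustrative numbers): with memory.max = 1000 pages and
 * usage = 900 pages, the memory margin is 100 pages. If memsw accounting is
 * active with memsw.max = 950 and memsw usage = 920, the memsw margin is 30,
 * so the function returns min(100, 30) = 30 pages.
 */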
1441
1442/*
1443 * A routine for checking whether "mem" is under move_account() or not.
1444 *
1445 * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
1446 * moving cgroups. This is used for waiting at high memory pressure
1447 * caused by a "move".
1448 */
1449static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1450{
1451 struct mem_cgroup *from;
1452 struct mem_cgroup *to;
1453 bool ret = false;
1454 /*
1455 * Unlike the task_move routines, we access mc.to and mc.from without the
1456 * mutual exclusion of cgroup_mutex. Here, we take the spinlock instead.
1457 */
1458 spin_lock(&mc.lock);
1459 from = mc.from;
1460 to = mc.to;
1461 if (!from)
1462 goto unlock;
1463
1464 ret = mem_cgroup_is_descendant(from, memcg) ||
1465 mem_cgroup_is_descendant(to, memcg);
1466unlock:
1467 spin_unlock(&mc.lock);
1468 return ret;
1469}
1470
1471static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1472{
1473 if (mc.moving_task && current != mc.moving_task) {
1474 if (mem_cgroup_under_move(memcg)) {
1475 DEFINE_WAIT(wait);
1476 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1477 /* moving charge context might have finished. */
1478 if (mc.moving_task)
1479 schedule();
1480 finish_wait(&mc.waitq, &wait);
1481 return true;
1482 }
1483 }
1484 return false;
1485}
1486
1487struct memory_stat {
1488 const char *name;
1489 unsigned int idx;
1490};
1491
1492static const struct memory_stat memory_stats[] = {
1493 { "anon", NR_ANON_MAPPED },
1494 { "file", NR_FILE_PAGES },
1495 { "kernel", MEMCG_KMEM },
1496 { "kernel_stack", NR_KERNEL_STACK_KB },
1497 { "pagetables", NR_PAGETABLE },
1498 { "sec_pagetables", NR_SECONDARY_PAGETABLE },
1499 { "percpu", MEMCG_PERCPU_B },
1500 { "sock", MEMCG_SOCK },
1501 { "vmalloc", MEMCG_VMALLOC },
1502 { "shmem", NR_SHMEM },
1503#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
1504 { "zswap", MEMCG_ZSWAP_B },
1505 { "zswapped", MEMCG_ZSWAPPED },
1506#endif
1507 { "file_mapped", NR_FILE_MAPPED },
1508 { "file_dirty", NR_FILE_DIRTY },
1509 { "file_writeback", NR_WRITEBACK },
1510#ifdef CONFIG_SWAP
1511 { "swapcached", NR_SWAPCACHE },
1512#endif
1513#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1514 { "anon_thp", NR_ANON_THPS },
1515 { "file_thp", NR_FILE_THPS },
1516 { "shmem_thp", NR_SHMEM_THPS },
1517#endif
1518 { "inactive_anon", NR_INACTIVE_ANON },
1519 { "active_anon", NR_ACTIVE_ANON },
1520 { "inactive_file", NR_INACTIVE_FILE },
1521 { "active_file", NR_ACTIVE_FILE },
1522 { "unevictable", NR_UNEVICTABLE },
1523 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1524 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1525
1526 /* The memory events */
1527 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1528 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1529 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1530 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1531 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1532 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1533 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1534};
1535
1536/* Translate stat items to the correct unit for memory.stat output */
1537static int memcg_page_state_unit(int item)
1538{
1539 switch (item) {
1540 case MEMCG_PERCPU_B:
1541 case MEMCG_ZSWAP_B:
1542 case NR_SLAB_RECLAIMABLE_B:
1543 case NR_SLAB_UNRECLAIMABLE_B:
1544 case WORKINGSET_REFAULT_ANON:
1545 case WORKINGSET_REFAULT_FILE:
1546 case WORKINGSET_ACTIVATE_ANON:
1547 case WORKINGSET_ACTIVATE_FILE:
1548 case WORKINGSET_RESTORE_ANON:
1549 case WORKINGSET_RESTORE_FILE:
1550 case WORKINGSET_NODERECLAIM:
1551 return 1;
1552 case NR_KERNEL_STACK_KB:
1553 return SZ_1K;
1554 default:
1555 return PAGE_SIZE;
1556 }
1557}
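/*
 * For example, NR_KERNEL_STACK_KB is accounted in kilobytes and is scaled by
 * SZ_1K, the byte-based counters (the *_B items) and the workingset event
 * counts are reported as-is (scaled by 1), and everything else is accounted
 * in pages and scaled by PAGE_SIZE for memory.stat output.
 */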
1558
1559static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1560 int item)
1561{
1562 return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1563}
1564
1565static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1566{
1567 int i;
1568
1569 /*
1570 * Provide statistics on the state of the memory subsystem as
1571 * well as cumulative event counters that show past behavior.
1572 *
1573 * This list is ordered following a combination of these gradients:
1574 * 1) generic big picture -> specifics and details
1575 * 2) reflecting userspace activity -> reflecting kernel heuristics
1576 *
1577 * Current memory state:
1578 */
1579 mem_cgroup_flush_stats();
1580
1581 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1582 u64 size;
1583
1584 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1585 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1586
1587 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1588 size += memcg_page_state_output(memcg,
1589 NR_SLAB_RECLAIMABLE_B);
1590 seq_buf_printf(s, "slab %llu\n", size);
1591 }
1592 }
1593
1594 /* Accumulated memory events */
1595 seq_buf_printf(s, "pgscan %lu\n",
1596 memcg_events(memcg, PGSCAN_KSWAPD) +
1597 memcg_events(memcg, PGSCAN_DIRECT) +
1598 memcg_events(memcg, PGSCAN_KHUGEPAGED));
1599 seq_buf_printf(s, "pgsteal %lu\n",
1600 memcg_events(memcg, PGSTEAL_KSWAPD) +
1601 memcg_events(memcg, PGSTEAL_DIRECT) +
1602 memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1603
1604 for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1605 if (memcg_vm_event_stat[i] == PGPGIN ||
1606 memcg_vm_event_stat[i] == PGPGOUT)
1607 continue;
1608
1609 seq_buf_printf(s, "%s %lu\n",
1610 vm_event_name(memcg_vm_event_stat[i]),
1611 memcg_events(memcg, memcg_vm_event_stat[i]));
1612 }
1613
1614 /* The above should easily fit into one page */
1615 WARN_ON_ONCE(seq_buf_has_overflowed(s));
1616}
1617
1618static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
1619
1620static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1621{
1622 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1623 memcg_stat_format(memcg, s);
1624 else
1625 memcg1_stat_format(memcg, s);
1626 WARN_ON_ONCE(seq_buf_has_overflowed(s));
1627}
1628
1629/**
1630 * mem_cgroup_print_oom_context: Print OOM information relevant to
1631 * memory controller.
1632 * @memcg: The memory cgroup that went over limit
1633 * @p: Task that is going to be killed
1634 *
1635 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1636 * enabled
1637 */
1638void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1639{
1640 rcu_read_lock();
1641
1642 if (memcg) {
1643 pr_cont(",oom_memcg=");
1644 pr_cont_cgroup_path(memcg->css.cgroup);
1645 } else
1646 pr_cont(",global_oom");
1647 if (p) {
1648 pr_cont(",task_memcg=");
1649 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1650 }
1651 rcu_read_unlock();
1652}
1653
1654/**
1655 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1656 * memory controller.
1657 * @memcg: The memory cgroup that went over limit
1658 */
1659void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1660{
1661 /* Use a static buffer, since the caller is holding oom_lock. */
1662 static char buf[PAGE_SIZE];
1663 struct seq_buf s;
1664
1665 lockdep_assert_held(&oom_lock);
1666
1667 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1668 K((u64)page_counter_read(&memcg->memory)),
1669 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1670 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1671 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1672 K((u64)page_counter_read(&memcg->swap)),
1673 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1674 else {
1675 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1676 K((u64)page_counter_read(&memcg->memsw)),
1677 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1678 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1679 K((u64)page_counter_read(&memcg->kmem)),
1680 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1681 }
1682
1683 pr_info("Memory cgroup stats for ");
1684 pr_cont_cgroup_path(memcg->css.cgroup);
1685 pr_cont(":");
1686 seq_buf_init(&s, buf, sizeof(buf));
1687 memory_stat_format(memcg, &s);
1688 seq_buf_do_printk(&s, KERN_INFO);
1689}
1690
1691/*
1692 * Return the memory (and swap, if configured) limit for a memcg.
1693 */
1694unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1695{
1696 unsigned long max = READ_ONCE(memcg->memory.max);
1697
1698 if (do_memsw_account()) {
1699 if (mem_cgroup_swappiness(memcg)) {
1700 /* Calculate swap excess capacity from memsw limit */
1701 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1702
1703 max += min(swap, (unsigned long)total_swap_pages);
1704 }
1705 } else {
1706 if (mem_cgroup_swappiness(memcg))
1707 max += min(READ_ONCE(memcg->swap.max),
1708 (unsigned long)total_swap_pages);
1709 }
1710 return max;
1711}
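/*
 * Worked example (illustrative numbers, legacy memsw mode with non-zero
 * swappiness): with memory.max = 100 pages, memsw.max = 150 pages, and ample
 * swap, the memsw limit allows 150 - 100 = 50 pages of swap on top of the
 * memory limit, so the function returns 100 + min(50, total_swap_pages) = 150.
 */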
1712
1713unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1714{
1715 return page_counter_read(&memcg->memory);
1716}
1717
1718static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1719 int order)
1720{
1721 struct oom_control oc = {
1722 .zonelist = NULL,
1723 .nodemask = NULL,
1724 .memcg = memcg,
1725 .gfp_mask = gfp_mask,
1726 .order = order,
1727 };
1728 bool ret = true;
1729
1730 if (mutex_lock_killable(&oom_lock))
1731 return true;
1732
1733 if (mem_cgroup_margin(memcg) >= (1 << order))
1734 goto unlock;
1735
1736 /*
1737 * A few threads which were not waiting at mutex_lock_killable() can
1738 * fail to bail out. Therefore, check again after holding oom_lock.
1739 */
1740 ret = task_is_dying() || out_of_memory(&oc);
1741
1742unlock:
1743 mutex_unlock(&oom_lock);
1744 return ret;
1745}
1746
1747static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1748 pg_data_t *pgdat,
1749 gfp_t gfp_mask,
1750 unsigned long *total_scanned)
1751{
1752 struct mem_cgroup *victim = NULL;
1753 int total = 0;
1754 int loop = 0;
1755 unsigned long excess;
1756 unsigned long nr_scanned;
1757 struct mem_cgroup_reclaim_cookie reclaim = {
1758 .pgdat = pgdat,
1759 };
1760
1761 excess = soft_limit_excess(root_memcg);
1762
1763 while (1) {
1764 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1765 if (!victim) {
1766 loop++;
1767 if (loop >= 2) {
1768 /*
1769 * If we have not been able to reclaim
1770 * anything, it might be because there are
1771 * no reclaimable pages under this hierarchy.
1772 */
1773 if (!total)
1774 break;
1775 /*
1776 * We want to do more targeted reclaim.
1777 * excess >> 2 is not too excessive, so we do not
1778 * reclaim too much, nor too little, so we do not keep
1779 * coming back to reclaim from this cgroup.
1780 */
1781 if (total >= (excess >> 2) ||
1782 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1783 break;
1784 }
1785 continue;
1786 }
1787 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1788 pgdat, &nr_scanned);
1789 *total_scanned += nr_scanned;
1790 if (!soft_limit_excess(root_memcg))
1791 break;
1792 }
1793 mem_cgroup_iter_break(root_memcg, victim);
1794 return total;
1795}
1796
1797#ifdef CONFIG_LOCKDEP
1798static struct lockdep_map memcg_oom_lock_dep_map = {
1799 .name = "memcg_oom_lock",
1800};
1801#endif
1802
1803static DEFINE_SPINLOCK(memcg_oom_lock);
1804
1805/*
1806 * Check whether the OOM killer is already running under our hierarchy.
1807 * If someone is already running it, return false.
1808 */
1809static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1810{
1811 struct mem_cgroup *iter, *failed = NULL;
1812
1813 spin_lock(&memcg_oom_lock);
1814
1815 for_each_mem_cgroup_tree(iter, memcg) {
1816 if (iter->oom_lock) {
1817 /*
1818 * this subtree of our hierarchy is already locked,
1819 * so we cannot take the lock for it.
1820 */
1821 failed = iter;
1822 mem_cgroup_iter_break(memcg, iter);
1823 break;
1824 } else
1825 iter->oom_lock = true;
1826 }
1827
1828 if (failed) {
1829 /*
1830 * OK, we failed to lock the whole subtree, so we have
1831 * to clean up what we already set up, up to the failing subtree.
1832 */
1833 for_each_mem_cgroup_tree(iter, memcg) {
1834 if (iter == failed) {
1835 mem_cgroup_iter_break(memcg, iter);
1836 break;
1837 }
1838 iter->oom_lock = false;
1839 }
1840 } else
1841 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1842
1843 spin_unlock(&memcg_oom_lock);
1844
1845 return !failed;
1846}
1847
1848static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1849{
1850 struct mem_cgroup *iter;
1851
1852 spin_lock(&memcg_oom_lock);
1853 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1854 for_each_mem_cgroup_tree(iter, memcg)
1855 iter->oom_lock = false;
1856 spin_unlock(&memcg_oom_lock);
1857}
1858
1859static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1860{
1861 struct mem_cgroup *iter;
1862
1863 spin_lock(&memcg_oom_lock);
1864 for_each_mem_cgroup_tree(iter, memcg)
1865 iter->under_oom++;
1866 spin_unlock(&memcg_oom_lock);
1867}
1868
1869static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1870{
1871 struct mem_cgroup *iter;
1872
1873 /*
1874 * Be careful about under_oom underflows because a child memcg
1875 * could have been added after mem_cgroup_mark_under_oom.
1876 */
1877 spin_lock(&memcg_oom_lock);
1878 for_each_mem_cgroup_tree(iter, memcg)
1879 if (iter->under_oom > 0)
1880 iter->under_oom--;
1881 spin_unlock(&memcg_oom_lock);
1882}
1883
1884static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1885
1886struct oom_wait_info {
1887 struct mem_cgroup *memcg;
1888 wait_queue_entry_t wait;
1889};
1890
1891static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1892 unsigned mode, int sync, void *arg)
1893{
1894 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1895 struct mem_cgroup *oom_wait_memcg;
1896 struct oom_wait_info *oom_wait_info;
1897
1898 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1899 oom_wait_memcg = oom_wait_info->memcg;
1900
1901 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1902 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1903 return 0;
1904 return autoremove_wake_function(wait, mode, sync, arg);
1905}
1906
1907static void memcg_oom_recover(struct mem_cgroup *memcg)
1908{
1909 /*
1910 * For the following lockless ->under_oom test, the only required
1911 * guarantee is that it must see the state asserted by an OOM when
1912 * this function is called as a result of userland actions
1913 * triggered by the notification of the OOM. This is trivially
1914 * achieved by invoking mem_cgroup_mark_under_oom() before
1915 * triggering notification.
1916 */
1917 if (memcg && memcg->under_oom)
1918 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1919}
1920
1921/*
1922 * Returns true if successfully killed one or more processes. Though in some
1923 * corner cases it can return true even without killing any process.
1924 */
1925static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1926{
1927 bool locked, ret;
1928
1929 if (order > PAGE_ALLOC_COSTLY_ORDER)
1930 return false;
1931
1932 memcg_memory_event(memcg, MEMCG_OOM);
1933
1934 /*
1935 * We are in the middle of the charge context here, so we
1936 * don't want to block when potentially sitting on a callstack
1937 * that holds all kinds of filesystem and mm locks.
1938 *
1939 * cgroup1 allows disabling the OOM killer and waiting for outside
1940 * handling until the charge can succeed; remember the context and put
1941 * the task to sleep at the end of the page fault when all locks are
1942 * released.
1943 *
1944 * On the other hand, in-kernel OOM killer allows for an async victim
1945 * memory reclaim (oom_reaper) and that means that we are not solely
1946 * relying on the oom victim to make a forward progress and we can
1947 * invoke the oom killer here.
1948 *
1949 * Please note that mem_cgroup_out_of_memory might fail to find a
1950 * victim and then we have to bail out from the charge path.
1951 */
1952 if (READ_ONCE(memcg->oom_kill_disable)) {
1953 if (current->in_user_fault) {
1954 css_get(&memcg->css);
1955 current->memcg_in_oom = memcg;
1956 current->memcg_oom_gfp_mask = mask;
1957 current->memcg_oom_order = order;
1958 }
1959 return false;
1960 }
1961
1962 mem_cgroup_mark_under_oom(memcg);
1963
1964 locked = mem_cgroup_oom_trylock(memcg);
1965
1966 if (locked)
1967 mem_cgroup_oom_notify(memcg);
1968
1969 mem_cgroup_unmark_under_oom(memcg);
1970 ret = mem_cgroup_out_of_memory(memcg, mask, order);
1971
1972 if (locked)
1973 mem_cgroup_oom_unlock(memcg);
1974
1975 return ret;
1976}
1977
1978/**
1979 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1980 * @handle: actually kill/wait or just clean up the OOM state
1981 *
1982 * This has to be called at the end of a page fault if the memcg OOM
1983 * handler was enabled.
1984 *
1985 * Memcg supports userspace OOM handling where failed allocations must
1986 * sleep on a waitqueue until the userspace task resolves the
1987 * situation. Sleeping directly in the charge context with all kinds
1988 * of locks held is not a good idea, instead we remember an OOM state
1989 * in the task and mem_cgroup_oom_synchronize() has to be called at
1990 * the end of the page fault to complete the OOM handling.
1991 *
1992 * Returns %true if an ongoing memcg OOM situation was detected and
1993 * completed, %false otherwise.
1994 */
1995bool mem_cgroup_oom_synchronize(bool handle)
1996{
1997 struct mem_cgroup *memcg = current->memcg_in_oom;
1998 struct oom_wait_info owait;
1999 bool locked;
2000
2001 /* OOM is global, do not handle */
2002 if (!memcg)
2003 return false;
2004
2005 if (!handle)
2006 goto cleanup;
2007
2008 owait.memcg = memcg;
2009 owait.wait.flags = 0;
2010 owait.wait.func = memcg_oom_wake_function;
2011 owait.wait.private = current;
2012 INIT_LIST_HEAD(&owait.wait.entry);
2013
2014 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2015 mem_cgroup_mark_under_oom(memcg);
2016
2017 locked = mem_cgroup_oom_trylock(memcg);
2018
2019 if (locked)
2020 mem_cgroup_oom_notify(memcg);
2021
2022 schedule();
2023 mem_cgroup_unmark_under_oom(memcg);
2024 finish_wait(&memcg_oom_waitq, &owait.wait);
2025
2026 if (locked)
2027 mem_cgroup_oom_unlock(memcg);
2028cleanup:
2029 current->memcg_in_oom = NULL;
2030 css_put(&memcg->css);
2031 return true;
2032}
2033
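/*
 * Sketch of the cgroup1 userspace OOM-handling round trip that the two
 * functions above implement (step 4 is the generic fault-retry
 * behaviour, not part of this file):
 *
 *	1. A charge fails, and mem_cgroup_oom() sees oom_kill_disable set;
 *	   the memcg, gfp mask and order are stashed in the faulting task.
 *	2. The page fault unwinds and releases its locks.
 *	3. mem_cgroup_oom_synchronize(true) marks the hierarchy under_oom,
 *	   sleeps killably on memcg_oom_waitq and is woken by
 *	   memcg_oom_recover(), e.g. after userspace raises the limit.
 *	4. The task returns from the fault and retries the access.
 */
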
2034/**
2035 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
2036 * @victim: task to be killed by the OOM killer
2037 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
2038 *
2039 * Returns a pointer to a memory cgroup, which has to be cleaned up
2040 * by killing all belonging OOM-killable tasks.
2041 *
2042 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2043 */
2044struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2045 struct mem_cgroup *oom_domain)
2046{
2047 struct mem_cgroup *oom_group = NULL;
2048 struct mem_cgroup *memcg;
2049
2050 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2051 return NULL;
2052
2053 if (!oom_domain)
2054 oom_domain = root_mem_cgroup;
2055
2056 rcu_read_lock();
2057
2058 memcg = mem_cgroup_from_task(victim);
2059 if (mem_cgroup_is_root(memcg))
2060 goto out;
2061
2062 /*
2063 * If the victim task has been asynchronously moved to a different
2064 * memory cgroup, we might end up killing tasks outside oom_domain.
2065 * In this case it's better to ignore memory.group.oom.
2066 */
2067 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2068 goto out;
2069
2070 /*
2071 * Traverse the memory cgroup hierarchy from the victim task's
2072 * cgroup up to the OOMing cgroup (or root) to find the
2073 * highest-level memory cgroup with oom.group set.
2074 */
2075 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2076 if (READ_ONCE(memcg->oom_group))
2077 oom_group = memcg;
2078
2079 if (memcg == oom_domain)
2080 break;
2081 }
2082
2083 if (oom_group)
2084 css_get(&oom_group->css);
2085out:
2086 rcu_read_unlock();
2087
2088 return oom_group;
2089}
2090
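/*
 * Illustrative walk (hierarchy and settings assumed for the example):
 * given root -> A -> B -> C with memory.oom.group set on A, a victim
 * task in C and oom_domain == A, the loop above walks C -> B -> A,
 * records A as the highest-level cgroup with oom.group set, stops at
 * the oom_domain and returns A with a css reference held. The OOM
 * killer then cleans up by killing the OOM-killable tasks in A's
 * subtree rather than just the single victim.
 */
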
2091void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2092{
2093 pr_info("Tasks in ");
2094 pr_cont_cgroup_path(memcg->css.cgroup);
2095 pr_cont(" are going to be killed due to memory.oom.group set\n");
2096}
2097
2098/**
2099 * folio_memcg_lock - Bind a folio to its memcg.
2100 * @folio: The folio.
2101 *
2102 * This function prevents unlocked LRU folios from being moved to
2103 * another cgroup.
2104 *
2105 * It ensures lifetime of the bound memcg. The caller is responsible
2106 * for the lifetime of the folio.
2107 */
2108void folio_memcg_lock(struct folio *folio)
2109{
2110 struct mem_cgroup *memcg;
2111 unsigned long flags;
2112
2113 /*
2114 * The RCU lock is held throughout the transaction. The fast
2115 * path can get away without acquiring the memcg->move_lock
2116 * because page moving starts with an RCU grace period.
2117 */
2118 rcu_read_lock();
2119
2120 if (mem_cgroup_disabled())
2121 return;
2122again:
2123 memcg = folio_memcg(folio);
2124 if (unlikely(!memcg))
2125 return;
2126
2127#ifdef CONFIG_PROVE_LOCKING
2128 local_irq_save(flags);
2129 might_lock(&memcg->move_lock);
2130 local_irq_restore(flags);
2131#endif
2132
2133 if (atomic_read(&memcg->moving_account) <= 0)
2134 return;
2135
2136 spin_lock_irqsave(&memcg->move_lock, flags);
2137 if (memcg != folio_memcg(folio)) {
2138 spin_unlock_irqrestore(&memcg->move_lock, flags);
2139 goto again;
2140 }
2141
2142 /*
2143 * When charge migration first begins, we can have multiple
2144 * critical sections holding the fast-path RCU lock and one
2145	 * holding the slowpath move_lock. Track the task that holds the
2146	 * move_lock so folio_memcg_unlock() can release it.
2147 */
2148 memcg->move_lock_task = current;
2149 memcg->move_lock_flags = flags;
2150}
2151
2152static void __folio_memcg_unlock(struct mem_cgroup *memcg)
2153{
2154 if (memcg && memcg->move_lock_task == current) {
2155 unsigned long flags = memcg->move_lock_flags;
2156
2157 memcg->move_lock_task = NULL;
2158 memcg->move_lock_flags = 0;
2159
2160 spin_unlock_irqrestore(&memcg->move_lock, flags);
2161 }
2162
2163 rcu_read_unlock();
2164}
2165
2166/**
2167 * folio_memcg_unlock - Release the binding between a folio and its memcg.
2168 * @folio: The folio.
2169 *
2170 * This releases the binding created by folio_memcg_lock(). This does
2171 * not change the accounting of this folio to its memcg, but it does
2172 * permit others to change it.
2173 */
2174void folio_memcg_unlock(struct folio *folio)
2175{
2176 __folio_memcg_unlock(folio_memcg(folio));
2177}
2178
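/*
 * Minimal usage sketch (the protected update is an assumption for
 * illustration, not taken from this file):
 *
 *	folio_memcg_lock(folio);
 *	... update state that must not race with charge moving ...
 *	folio_memcg_unlock(folio);
 *
 * The pair keeps the folio bound to its memcg for the duration of the
 * update; the caller remains responsible for the folio's own lifetime.
 */
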
2179struct memcg_stock_pcp {
2180 local_lock_t stock_lock;
2181	struct mem_cgroup *cached; /* this is never the root cgroup */
2182 unsigned int nr_pages;
2183
2184#ifdef CONFIG_MEMCG_KMEM
2185 struct obj_cgroup *cached_objcg;
2186 struct pglist_data *cached_pgdat;
2187 unsigned int nr_bytes;
2188 int nr_slab_reclaimable_b;
2189 int nr_slab_unreclaimable_b;
2190#endif
2191
2192 struct work_struct work;
2193 unsigned long flags;
2194#define FLUSHING_CACHED_CHARGE 0
2195};
2196static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
2197 .stock_lock = INIT_LOCAL_LOCK(stock_lock),
2198};
2199static DEFINE_MUTEX(percpu_charge_mutex);
2200
2201#ifdef CONFIG_MEMCG_KMEM
2202static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
2203static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2204 struct mem_cgroup *root_memcg);
2205static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
2206
2207#else
2208static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2209{
2210 return NULL;
2211}
2212static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2213 struct mem_cgroup *root_memcg)
2214{
2215 return false;
2216}
2217static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2218{
2219}
2220#endif
2221
2222/**
2223 * consume_stock: Try to consume stocked charge on this cpu.
2224 * @memcg: memcg to consume from.
2225 * @nr_pages: how many pages to charge.
2226 *
2227 * The charges will only happen if @memcg matches the current cpu's memcg
2228 * stock, and at least @nr_pages are available in that stock. Failure to
2229 * service an allocation will refill the stock.
2230 *
2231 * returns true if successful, false otherwise.
2232 */
2233static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2234{
2235 struct memcg_stock_pcp *stock;
2236 unsigned long flags;
2237 bool ret = false;
2238
2239 if (nr_pages > MEMCG_CHARGE_BATCH)
2240 return ret;
2241
2242 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2243
2244 stock = this_cpu_ptr(&memcg_stock);
2245 if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
2246 stock->nr_pages -= nr_pages;
2247 ret = true;
2248 }
2249
2250 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2251
2252 return ret;
2253}
2254
2255/*
2256 * Drain charges cached in the percpu stock back to the page counters and reset the cached state.
2257 */
2258static void drain_stock(struct memcg_stock_pcp *stock)
2259{
2260 struct mem_cgroup *old = READ_ONCE(stock->cached);
2261
2262 if (!old)
2263 return;
2264
2265 if (stock->nr_pages) {
2266 page_counter_uncharge(&old->memory, stock->nr_pages);
2267 if (do_memsw_account())
2268 page_counter_uncharge(&old->memsw, stock->nr_pages);
2269 stock->nr_pages = 0;
2270 }
2271
2272 css_put(&old->css);
2273 WRITE_ONCE(stock->cached, NULL);
2274}
2275
2276static void drain_local_stock(struct work_struct *dummy)
2277{
2278 struct memcg_stock_pcp *stock;
2279 struct obj_cgroup *old = NULL;
2280 unsigned long flags;
2281
2282 /*
2283 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
2284 * drain_stock races is that we always operate on local CPU stock
2285	 * here with IRQs disabled.
2286 */
2287 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2288
2289 stock = this_cpu_ptr(&memcg_stock);
2290 old = drain_obj_stock(stock);
2291 drain_stock(stock);
2292 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2293
2294 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2295 if (old)
2296 obj_cgroup_put(old);
2297}
2298
2299/*
2300 * Cache charges (nr_pages) in the local per-CPU area.
2301 * They will be consumed by consume_stock() later.
2302 */
2303static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2304{
2305 struct memcg_stock_pcp *stock;
2306
2307 stock = this_cpu_ptr(&memcg_stock);
2308 if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
2309 drain_stock(stock);
2310 css_get(&memcg->css);
2311 WRITE_ONCE(stock->cached, memcg);
2312 }
2313 stock->nr_pages += nr_pages;
2314
2315 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2316 drain_stock(stock);
2317}
2318
2319static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2320{
2321 unsigned long flags;
2322
2323 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2324 __refill_stock(memcg, nr_pages);
2325 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2326}
2327
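/*
 * Example of the round trip through the stock (the batch size is an
 * assumption; MEMCG_CHARGE_BATCH is 64 pages at the time of writing):
 * a one-page charge that misses the stock charges a whole batch of 64
 * pages to the page counters in try_charge_memcg() below and hands the
 * 63 leftover pages to refill_stock(), so the next 63 same-memcg
 * charges on this CPU are served by consume_stock() without touching
 * the shared counters.
 */
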
2328/*
2329 * Drain all per-CPU charge caches for the given root_memcg, i.e. the whole
2330 * subtree of the hierarchy under it.
2331 */
2332static void drain_all_stock(struct mem_cgroup *root_memcg)
2333{
2334 int cpu, curcpu;
2335
2336	/* If someone's already draining, avoid starting more workers. */
2337 if (!mutex_trylock(&percpu_charge_mutex))
2338 return;
2339 /*
2340 * Notify other cpus that system-wide "drain" is running
2341 * We do not care about races with the cpu hotplug because cpu down
2342 * as well as workers from this path always operate on the local
2343 * per-cpu data. CPU up doesn't touch memcg_stock at all.
2344 */
2345 migrate_disable();
2346 curcpu = smp_processor_id();
2347 for_each_online_cpu(cpu) {
2348 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2349 struct mem_cgroup *memcg;
2350 bool flush = false;
2351
2352 rcu_read_lock();
2353 memcg = READ_ONCE(stock->cached);
2354 if (memcg && stock->nr_pages &&
2355 mem_cgroup_is_descendant(memcg, root_memcg))
2356 flush = true;
2357 else if (obj_stock_flush_required(stock, root_memcg))
2358 flush = true;
2359 rcu_read_unlock();
2360
2361 if (flush &&
2362 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2363 if (cpu == curcpu)
2364 drain_local_stock(&stock->work);
2365 else if (!cpu_is_isolated(cpu))
2366 schedule_work_on(cpu, &stock->work);
2367 }
2368 }
2369 migrate_enable();
2370 mutex_unlock(&percpu_charge_mutex);
2371}
2372
2373static int memcg_hotplug_cpu_dead(unsigned int cpu)
2374{
2375 struct memcg_stock_pcp *stock;
2376
2377 stock = &per_cpu(memcg_stock, cpu);
2378 drain_stock(stock);
2379
2380 return 0;
2381}
2382
2383static unsigned long reclaim_high(struct mem_cgroup *memcg,
2384 unsigned int nr_pages,
2385 gfp_t gfp_mask)
2386{
2387 unsigned long nr_reclaimed = 0;
2388
2389 do {
2390 unsigned long pflags;
2391
2392 if (page_counter_read(&memcg->memory) <=
2393 READ_ONCE(memcg->memory.high))
2394 continue;
2395
2396 memcg_memory_event(memcg, MEMCG_HIGH);
2397
2398 psi_memstall_enter(&pflags);
2399 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2400 gfp_mask,
2401 MEMCG_RECLAIM_MAY_SWAP);
2402 psi_memstall_leave(&pflags);
2403 } while ((memcg = parent_mem_cgroup(memcg)) &&
2404 !mem_cgroup_is_root(memcg));
2405
2406 return nr_reclaimed;
2407}
2408
2409static void high_work_func(struct work_struct *work)
2410{
2411 struct mem_cgroup *memcg;
2412
2413 memcg = container_of(work, struct mem_cgroup, high_work);
2414 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2415}
2416
2417/*
2418 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2419 * long enough to still cause a significant slowdown in most cases, while
2420 * allowing diagnostics and tracing to proceed without becoming stuck.
2421 */
2422#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2423
2424/*
2425 * When calculating the delay, we use these on either side of the exponentiation
2426 * to maintain precision and scale to a reasonable number of jiffies (see the
2427 * table below).
2428 *
2429 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2430 * overage ratio to a delay.
2431 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2432 * proposed penalty in order to reduce to a reasonable number of jiffies, and
2433 * to produce a reasonable delay curve.
2434 *
2435 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2436 * reasonable delay curve compared to precision-adjusted overage, not
2437 * penalising heavily at first, but still making sure that growth beyond the
2438 * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2439 * limit penalises misbehaving cgroups by slowing them down exponentially. For
2440 *
2441 * +-------+------------------------+
2442 * | usage | time to allocate in ms |
2443 * +-------+------------------------+
2444 * | 100M | 0 |
2445 * | 101M | 6 |
2446 * | 102M | 25 |
2447 * | 103M | 57 |
2448 * | 104M | 102 |
2449 * | 105M | 159 |
2450 * | 106M | 230 |
2451 * | 107M | 313 |
2452 * | 108M | 409 |
2453 * | 109M | 518 |
2454 * | 110M | 639 |
2455 * | 111M | 774 |
2456 * | 112M | 921 |
2457 * | 113M | 1081 |
2458 * | 114M | 1254 |
2459 * | 115M | 1439 |
2460 * | 116M | 1638 |
2461 * | 117M | 1849 |
2462 * | 118M | 2000 |
2463 * | 119M | 2000 |
2464 * | 120M | 2000 |
2465 * +-------+------------------------+
2466 */
2467#define MEMCG_DELAY_PRECISION_SHIFT 20
2468#define MEMCG_DELAY_SCALING_SHIFT 14
2469
2470static u64 calculate_overage(unsigned long usage, unsigned long high)
2471{
2472 u64 overage;
2473
2474 if (usage <= high)
2475 return 0;
2476
2477 /*
2478 * Prevent division by 0 in overage calculation by acting as if
2479 * it was a threshold of 1 page
2480 */
2481 high = max(high, 1UL);
2482
2483 overage = usage - high;
2484 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2485 return div64_u64(overage, high);
2486}
2487
2488static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2489{
2490 u64 overage, max_overage = 0;
2491
2492 do {
2493 overage = calculate_overage(page_counter_read(&memcg->memory),
2494 READ_ONCE(memcg->memory.high));
2495 max_overage = max(overage, max_overage);
2496 } while ((memcg = parent_mem_cgroup(memcg)) &&
2497 !mem_cgroup_is_root(memcg));
2498
2499 return max_overage;
2500}
2501
2502static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2503{
2504 u64 overage, max_overage = 0;
2505
2506 do {
2507 overage = calculate_overage(page_counter_read(&memcg->swap),
2508 READ_ONCE(memcg->swap.high));
2509 if (overage)
2510 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2511 max_overage = max(overage, max_overage);
2512 } while ((memcg = parent_mem_cgroup(memcg)) &&
2513 !mem_cgroup_is_root(memcg));
2514
2515 return max_overage;
2516}
2517
2518/*
2519 * Get the number of jiffies that we should penalise a mischievous cgroup which
2520 * is exceeding its memory.high by checking both it and its ancestors.
2521 */
2522static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2523 unsigned int nr_pages,
2524 u64 max_overage)
2525{
2526 unsigned long penalty_jiffies;
2527
2528 if (!max_overage)
2529 return 0;
2530
2531 /*
2532 * We use overage compared to memory.high to calculate the number of
2533 * jiffies to sleep (penalty_jiffies). Ideally this value should be
2534 * fairly lenient on small overages, and increasingly harsh when the
2535 * memcg in question makes it clear that it has no intention of stopping
2536 * its crazy behaviour, so we exponentially increase the delay based on
2537 * overage amount.
2538 */
2539 penalty_jiffies = max_overage * max_overage * HZ;
2540 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2541 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2542
2543 /*
2544 * Factor in the task's own contribution to the overage, such that four
2545 * N-sized allocations are throttled approximately the same as one
2546 * 4N-sized allocation.
2547 *
2548 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2549	 * larger the current charge batch is than that.
2550 */
2551 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2552}
2553
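/*
 * Worked example for the helpers above (HZ == 1000 assumed, figures
 * rounded): with memory.high at 100M and usage at 105M,
 *
 *	overage = ((105M - 100M) << MEMCG_DELAY_PRECISION_SHIFT) / 100M
 *	        ~= 0.05 * 2^20 = 52428
 *	penalty = (52428 * 52428 * HZ)
 *	              >> MEMCG_DELAY_PRECISION_SHIFT
 *	              >> MEMCG_DELAY_SCALING_SHIFT
 *	        ~= 160 jiffies ~= 160ms
 *
 * which lines up with the ~159ms row of the table further up. The
 * result is then scaled by nr_pages / MEMCG_CHARGE_BATCH and clamped
 * to MEMCG_MAX_HIGH_DELAY_JIFFIES before the task is put to sleep.
 */
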
2554/*
2555 * Scheduled by try_charge() to be executed from the userland return path
2556 * and reclaims memory over the high limit.
2557 */
2558void mem_cgroup_handle_over_high(gfp_t gfp_mask)
2559{
2560 unsigned long penalty_jiffies;
2561 unsigned long pflags;
2562 unsigned long nr_reclaimed;
2563 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2564 int nr_retries = MAX_RECLAIM_RETRIES;
2565 struct mem_cgroup *memcg;
2566 bool in_retry = false;
2567
2568 if (likely(!nr_pages))
2569 return;
2570
2571 memcg = get_mem_cgroup_from_mm(current->mm);
2572 current->memcg_nr_pages_over_high = 0;
2573
2574retry_reclaim:
2575 /*
2576 * The allocating task should reclaim at least the batch size, but for
2577 * subsequent retries we only want to do what's necessary to prevent oom
2578 * or breaching resource isolation.
2579 *
2580 * This is distinct from memory.max or page allocator behaviour because
2581 * memory.high is currently batched, whereas memory.max and the page
2582 * allocator run every time an allocation is made.
2583 */
2584 nr_reclaimed = reclaim_high(memcg,
2585 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2586 gfp_mask);
2587
2588 /*
2589 * memory.high is breached and reclaim is unable to keep up. Throttle
2590 * allocators proactively to slow down excessive growth.
2591 */
2592 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2593 mem_find_max_overage(memcg));
2594
2595 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2596 swap_find_max_overage(memcg));
2597
2598 /*
2599 * Clamp the max delay per usermode return so as to still keep the
2600 * application moving forwards and also permit diagnostics, albeit
2601 * extremely slowly.
2602 */
2603 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2604
2605 /*
2606 * Don't sleep if the amount of jiffies this memcg owes us is so low
2607 * that it's not even worth doing, in an attempt to be nice to those who
2608 * go only a small amount over their memory.high value and maybe haven't
2609 * been aggressively reclaimed enough yet.
2610 */
2611 if (penalty_jiffies <= HZ / 100)
2612 goto out;
2613
2614 /*
2615 * If reclaim is making forward progress but we're still over
2616 * memory.high, we want to encourage that rather than doing allocator
2617 * throttling.
2618 */
2619 if (nr_reclaimed || nr_retries--) {
2620 in_retry = true;
2621 goto retry_reclaim;
2622 }
2623
2624 /*
2625 * If we exit early, we're guaranteed to die (since
2626 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2627 * need to account for any ill-begotten jiffies to pay them off later.
2628 */
2629 psi_memstall_enter(&pflags);
2630 schedule_timeout_killable(penalty_jiffies);
2631 psi_memstall_leave(&pflags);
2632
2633out:
2634 css_put(&memcg->css);
2635}
2636
2637static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2638 unsigned int nr_pages)
2639{
2640 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2641 int nr_retries = MAX_RECLAIM_RETRIES;
2642 struct mem_cgroup *mem_over_limit;
2643 struct page_counter *counter;
2644 unsigned long nr_reclaimed;
2645 bool passed_oom = false;
2646 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2647 bool drained = false;
2648 bool raised_max_event = false;
2649 unsigned long pflags;
2650
2651retry:
2652 if (consume_stock(memcg, nr_pages))
2653 return 0;
2654
2655 if (!do_memsw_account() ||
2656 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2657 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2658 goto done_restock;
2659 if (do_memsw_account())
2660 page_counter_uncharge(&memcg->memsw, batch);
2661 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2662 } else {
2663 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2664 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2665 }
2666
2667 if (batch > nr_pages) {
2668 batch = nr_pages;
2669 goto retry;
2670 }
2671
2672 /*
2673 * Prevent unbounded recursion when reclaim operations need to
2674 * allocate memory. This might exceed the limits temporarily,
2675 * but we prefer facilitating memory reclaim and getting back
2676 * under the limit over triggering OOM kills in these cases.
2677 */
2678 if (unlikely(current->flags & PF_MEMALLOC))
2679 goto force;
2680
2681 if (unlikely(task_in_memcg_oom(current)))
2682 goto nomem;
2683
2684 if (!gfpflags_allow_blocking(gfp_mask))
2685 goto nomem;
2686
2687 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2688 raised_max_event = true;
2689
2690 psi_memstall_enter(&pflags);
2691 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2692 gfp_mask, reclaim_options);
2693 psi_memstall_leave(&pflags);
2694
2695 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2696 goto retry;
2697
2698 if (!drained) {
2699 drain_all_stock(mem_over_limit);
2700 drained = true;
2701 goto retry;
2702 }
2703
2704 if (gfp_mask & __GFP_NORETRY)
2705 goto nomem;
2706 /*
2707 * Even though the limit is exceeded at this point, reclaim
2708 * may have been able to free some pages. Retry the charge
2709 * before killing the task.
2710 *
2711 * Only for regular pages, though: huge pages are rather
2712 * unlikely to succeed so close to the limit, and we fall back
2713 * to regular pages anyway in case of failure.
2714 */
2715 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2716 goto retry;
2717 /*
2718	 * During a task move, charges can be double-counted. So it's better
2719	 * to wait until the end of the task move if one is in progress.
2720 */
2721 if (mem_cgroup_wait_acct_move(mem_over_limit))
2722 goto retry;
2723
2724 if (nr_retries--)
2725 goto retry;
2726
2727 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2728 goto nomem;
2729
2730 /* Avoid endless loop for tasks bypassed by the oom killer */
2731 if (passed_oom && task_is_dying())
2732 goto nomem;
2733
2734 /*
2735	 * Keep retrying as long as the memcg oom killer is able to make
2736	 * forward progress, or bypass the charge if the oom killer
2737	 * couldn't make any progress.
2738 */
2739 if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2740 get_order(nr_pages * PAGE_SIZE))) {
2741 passed_oom = true;
2742 nr_retries = MAX_RECLAIM_RETRIES;
2743 goto retry;
2744 }
2745nomem:
2746 /*
2747 * Memcg doesn't have a dedicated reserve for atomic
2748 * allocations. But like the global atomic pool, we need to
2749 * put the burden of reclaim on regular allocation requests
2750 * and let these go through as privileged allocations.
2751 */
2752 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2753 return -ENOMEM;
2754force:
2755 /*
2756 * If the allocation has to be enforced, don't forget to raise
2757 * a MEMCG_MAX event.
2758 */
2759 if (!raised_max_event)
2760 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2761
2762 /*
2763 * The allocation either can't fail or will lead to more memory
2764	 * being freed very soon. Allow memory usage to go over the limit
2765 * temporarily by force charging it.
2766 */
2767 page_counter_charge(&memcg->memory, nr_pages);
2768 if (do_memsw_account())
2769 page_counter_charge(&memcg->memsw, nr_pages);
2770
2771 return 0;
2772
2773done_restock:
2774 if (batch > nr_pages)
2775 refill_stock(memcg, batch - nr_pages);
2776
2777 /*
2778 * If the hierarchy is above the normal consumption range, schedule
2779 * reclaim on returning to userland. We can perform reclaim here
2780 * if __GFP_RECLAIM but let's always punt for simplicity and so that
2781 * GFP_KERNEL can consistently be used during reclaim. @memcg is
2782 * not recorded as it most likely matches current's and won't
2783 * change in the meantime. As high limit is checked again before
2784 * reclaim, the cost of mismatch is negligible.
2785 */
2786 do {
2787 bool mem_high, swap_high;
2788
2789 mem_high = page_counter_read(&memcg->memory) >
2790 READ_ONCE(memcg->memory.high);
2791 swap_high = page_counter_read(&memcg->swap) >
2792 READ_ONCE(memcg->swap.high);
2793
2794 /* Don't bother a random interrupted task */
2795 if (!in_task()) {
2796 if (mem_high) {
2797 schedule_work(&memcg->high_work);
2798 break;
2799 }
2800 continue;
2801 }
2802
2803 if (mem_high || swap_high) {
2804 /*
2805 * The allocating tasks in this cgroup will need to do
2806 * reclaim or be throttled to prevent further growth
2807 * of the memory or swap footprints.
2808 *
2809 * Target some best-effort fairness between the tasks,
2810 * and distribute reclaim work and delay penalties
2811 * based on how much each task is actually allocating.
2812 */
2813 current->memcg_nr_pages_over_high += batch;
2814 set_notify_resume(current);
2815 break;
2816 }
2817 } while ((memcg = parent_mem_cgroup(memcg)));
2818
2819 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2820 !(current->flags & PF_MEMALLOC) &&
2821 gfpflags_allow_blocking(gfp_mask)) {
2822 mem_cgroup_handle_over_high(gfp_mask);
2823 }
2824 return 0;
2825}
2826
2827static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2828 unsigned int nr_pages)
2829{
2830 if (mem_cgroup_is_root(memcg))
2831 return 0;
2832
2833 return try_charge_memcg(memcg, gfp_mask, nr_pages);
2834}
2835
2836static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2837{
2838 if (mem_cgroup_is_root(memcg))
2839 return;
2840
2841 page_counter_uncharge(&memcg->memory, nr_pages);
2842 if (do_memsw_account())
2843 page_counter_uncharge(&memcg->memsw, nr_pages);
2844}
2845
2846static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2847{
2848 VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
2849 /*
2850 * Any of the following ensures page's memcg stability:
2851 *
2852 * - the page lock
2853 * - LRU isolation
2854 * - folio_memcg_lock()
2855 * - exclusive reference
2856 * - mem_cgroup_trylock_pages()
2857 */
2858 folio->memcg_data = (unsigned long)memcg;
2859}
2860
2861#ifdef CONFIG_MEMCG_KMEM
2862/*
2863 * The allocated objcg pointers array is not accounted directly.
2864 * Moreover, it should not come from a DMA buffer and is not readily
2865 * reclaimable. So those GFP bits should be masked off.
2866 */
2867#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2868
2869/*
2870 * mod_objcg_mlstate() may be called with irqs enabled, so
2871 * mod_memcg_lruvec_state() should be used.
2872 */
2873static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2874 struct pglist_data *pgdat,
2875 enum node_stat_item idx, int nr)
2876{
2877 struct mem_cgroup *memcg;
2878 struct lruvec *lruvec;
2879
2880 rcu_read_lock();
2881 memcg = obj_cgroup_memcg(objcg);
2882 lruvec = mem_cgroup_lruvec(memcg, pgdat);
2883 mod_memcg_lruvec_state(lruvec, idx, nr);
2884 rcu_read_unlock();
2885}
2886
2887int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
2888 gfp_t gfp, bool new_slab)
2889{
2890 unsigned int objects = objs_per_slab(s, slab);
2891 unsigned long memcg_data;
2892 void *vec;
2893
2894 gfp &= ~OBJCGS_CLEAR_MASK;
2895 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2896 slab_nid(slab));
2897 if (!vec)
2898 return -ENOMEM;
2899
2900 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2901 if (new_slab) {
2902 /*
2903 * If the slab is brand new and nobody can yet access its
2904 * memcg_data, no synchronization is required and memcg_data can
2905 * be simply assigned.
2906 */
2907 slab->memcg_data = memcg_data;
2908 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
2909 /*
2910 * If the slab is already in use, somebody can allocate and
2911 * assign obj_cgroups in parallel. In this case the existing
2912 * objcg vector should be reused.
2913 */
2914 kfree(vec);
2915 return 0;
2916 }
2917
2918 kmemleak_not_leak(vec);
2919 return 0;
2920}
2921
2922static __always_inline
2923struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2924{
2925 /*
2926 * Slab objects are accounted individually, not per-page.
2927 * Memcg membership data for each individual object is saved in
2928 * slab->memcg_data.
2929 */
2930 if (folio_test_slab(folio)) {
2931 struct obj_cgroup **objcgs;
2932 struct slab *slab;
2933 unsigned int off;
2934
2935 slab = folio_slab(folio);
2936 objcgs = slab_objcgs(slab);
2937 if (!objcgs)
2938 return NULL;
2939
2940 off = obj_to_index(slab->slab_cache, slab, p);
2941 if (objcgs[off])
2942 return obj_cgroup_memcg(objcgs[off]);
2943
2944 return NULL;
2945 }
2946
2947 /*
2948 * folio_memcg_check() is used here, because in theory we can encounter
2949 * a folio where the slab flag has been cleared already, but
2950 * slab->memcg_data has not been freed yet.
2951 * folio_memcg_check() will guarantee that a proper memory
2952 * cgroup pointer or NULL will be returned.
2953 */
2954 return folio_memcg_check(folio);
2955}
2956
2957/*
2958 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2959 *
2960 * A passed kernel object can be a slab object, vmalloc object or a generic
2961 * kernel page, so different mechanisms for getting the memory cgroup pointer
2962 * should be used.
2963 *
2964 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
2965 * can not know for sure how the kernel object is implemented.
2966 * mem_cgroup_from_obj() can be safely used in such cases.
2967 *
2968 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2969 * cgroup_mutex, etc.
2970 */
2971struct mem_cgroup *mem_cgroup_from_obj(void *p)
2972{
2973 struct folio *folio;
2974
2975 if (mem_cgroup_disabled())
2976 return NULL;
2977
2978 if (unlikely(is_vmalloc_addr(p)))
2979 folio = page_folio(vmalloc_to_page(p));
2980 else
2981 folio = virt_to_folio(p);
2982
2983 return mem_cgroup_from_obj_folio(folio, p);
2984}
2985
2986/*
2987 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2988 * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
2989 * allocated using vmalloc().
2990 *
2991 * A passed kernel object must be a slab object or a generic kernel page.
2992 *
2993 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2994 * cgroup_mutex, etc.
2995 */
2996struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2997{
2998 if (mem_cgroup_disabled())
2999 return NULL;
3000
3001 return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
3002}
3003
3004static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
3005{
3006 struct obj_cgroup *objcg = NULL;
3007
3008 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
3009 objcg = rcu_dereference(memcg->objcg);
3010 if (objcg && obj_cgroup_tryget(objcg))
3011 break;
3012 objcg = NULL;
3013 }
3014 return objcg;
3015}
3016
3017__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3018{
3019 struct obj_cgroup *objcg = NULL;
3020 struct mem_cgroup *memcg;
3021
3022 if (memcg_kmem_bypass())
3023 return NULL;
3024
3025 rcu_read_lock();
3026 if (unlikely(active_memcg()))
3027 memcg = active_memcg();
3028 else
3029 memcg = mem_cgroup_from_task(current);
3030 objcg = __get_obj_cgroup_from_memcg(memcg);
3031 rcu_read_unlock();
3032 return objcg;
3033}
3034
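/*
 * Usage sketch for remote charging (the caller is hypothetical;
 * set_active_memcg() comes from linux/sched/mm.h): kernel code that
 * wants an allocation accounted to a specific memcg instead of
 * current's can do
 *
 *	old = set_active_memcg(memcg);
 *	ptr = kmalloc(size, GFP_KERNEL_ACCOUNT);
 *	set_active_memcg(old);
 *
 * and get_obj_cgroup_from_current() above will then pick the active
 * memcg instead of current's cgroup.
 */
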
3035struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
3036{
3037 struct obj_cgroup *objcg;
3038
3039 if (!memcg_kmem_online())
3040 return NULL;
3041
3042 if (folio_memcg_kmem(folio)) {
3043 objcg = __folio_objcg(folio);
3044 obj_cgroup_get(objcg);
3045 } else {
3046 struct mem_cgroup *memcg;
3047
3048 rcu_read_lock();
3049 memcg = __folio_memcg(folio);
3050 if (memcg)
3051 objcg = __get_obj_cgroup_from_memcg(memcg);
3052 else
3053 objcg = NULL;
3054 rcu_read_unlock();
3055 }
3056 return objcg;
3057}
3058
3059static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3060{
3061 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
3062 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3063 if (nr_pages > 0)
3064 page_counter_charge(&memcg->kmem, nr_pages);
3065 else
3066 page_counter_uncharge(&memcg->kmem, -nr_pages);
3067 }
3068}
3069
3070
3071/*
3072 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
3073 * @objcg: object cgroup to uncharge
3074 * @nr_pages: number of pages to uncharge
3075 */
3076static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
3077 unsigned int nr_pages)
3078{
3079 struct mem_cgroup *memcg;
3080
3081 memcg = get_mem_cgroup_from_objcg(objcg);
3082
3083 memcg_account_kmem(memcg, -nr_pages);
3084 refill_stock(memcg, nr_pages);
3085
3086 css_put(&memcg->css);
3087}
3088
3089/*
3090 * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
3091 * @objcg: object cgroup to charge
3092 * @gfp: reclaim mode
3093 * @nr_pages: number of pages to charge
3094 *
3095 * Returns 0 on success, an error code on failure.
3096 */
3097static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
3098 unsigned int nr_pages)
3099{
3100 struct mem_cgroup *memcg;
3101 int ret;
3102
3103 memcg = get_mem_cgroup_from_objcg(objcg);
3104
3105 ret = try_charge_memcg(memcg, gfp, nr_pages);
3106 if (ret)
3107 goto out;
3108
3109 memcg_account_kmem(memcg, nr_pages);
3110out:
3111 css_put(&memcg->css);
3112
3113 return ret;
3114}
3115
3116/**
3117 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3118 * @page: page to charge
3119 * @gfp: reclaim mode
3120 * @order: allocation order
3121 *
3122 * Returns 0 on success, an error code on failure.
3123 */
3124int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3125{
3126 struct obj_cgroup *objcg;
3127 int ret = 0;
3128
3129 objcg = get_obj_cgroup_from_current();
3130 if (objcg) {
3131 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3132 if (!ret) {
3133 page->memcg_data = (unsigned long)objcg |
3134 MEMCG_DATA_KMEM;
3135 return 0;
3136 }
3137 obj_cgroup_put(objcg);
3138 }
3139 return ret;
3140}
3141
3142/**
3143 * __memcg_kmem_uncharge_page: uncharge a kmem page
3144 * @page: page to uncharge
3145 * @order: allocation order
3146 */
3147void __memcg_kmem_uncharge_page(struct page *page, int order)
3148{
3149 struct folio *folio = page_folio(page);
3150 struct obj_cgroup *objcg;
3151 unsigned int nr_pages = 1 << order;
3152
3153 if (!folio_memcg_kmem(folio))
3154 return;
3155
3156 objcg = __folio_objcg(folio);
3157 obj_cgroup_uncharge_pages(objcg, nr_pages);
3158 folio->memcg_data = 0;
3159 obj_cgroup_put(objcg);
3160}
3161
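/*
 * Caller-side illustration (the allocation site is hypothetical): page
 * allocations made with __GFP_ACCOUNT, e.g.
 *
 *	page = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 *
 * are charged by the page allocator through __memcg_kmem_charge_page()
 * and carry the objcg pointer tagged MEMCG_DATA_KMEM in memcg_data;
 * freeing them uncharges via __memcg_kmem_uncharge_page().
 */
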
3162void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3163 enum node_stat_item idx, int nr)
3164{
3165 struct memcg_stock_pcp *stock;
3166 struct obj_cgroup *old = NULL;
3167 unsigned long flags;
3168 int *bytes;
3169
3170 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3171 stock = this_cpu_ptr(&memcg_stock);
3172
3173 /*
3174 * Save vmstat data in stock and skip vmstat array update unless
3175 * accumulating over a page of vmstat data or when pgdat or idx
3176 * changes.
3177 */
3178 if (READ_ONCE(stock->cached_objcg) != objcg) {
3179 old = drain_obj_stock(stock);
3180 obj_cgroup_get(objcg);
3181 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3182 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3183 WRITE_ONCE(stock->cached_objcg, objcg);
3184 stock->cached_pgdat = pgdat;
3185 } else if (stock->cached_pgdat != pgdat) {
3186 /* Flush the existing cached vmstat data */
3187 struct pglist_data *oldpg = stock->cached_pgdat;
3188
3189 if (stock->nr_slab_reclaimable_b) {
3190 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3191 stock->nr_slab_reclaimable_b);
3192 stock->nr_slab_reclaimable_b = 0;
3193 }
3194 if (stock->nr_slab_unreclaimable_b) {
3195 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3196 stock->nr_slab_unreclaimable_b);
3197 stock->nr_slab_unreclaimable_b = 0;
3198 }
3199 stock->cached_pgdat = pgdat;
3200 }
3201
3202 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3203 : &stock->nr_slab_unreclaimable_b;
3204 /*
3205	 * Even for large objects >= PAGE_SIZE, the vmstat data will still be
3206 * cached locally at least once before pushing it out.
3207 */
3208 if (!*bytes) {
3209 *bytes = nr;
3210 nr = 0;
3211 } else {
3212 *bytes += nr;
3213 if (abs(*bytes) > PAGE_SIZE) {
3214 nr = *bytes;
3215 *bytes = 0;
3216 } else {
3217 nr = 0;
3218 }
3219 }
3220 if (nr)
3221 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3222
3223 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3224 if (old)
3225 obj_cgroup_put(old);
3226}
3227
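/*
 * Worked example of the byte batching above (update size chosen for
 * illustration): repeated +64 byte NR_SLAB_RECLAIMABLE_B updates for
 * the same objcg and pgdat only accumulate in
 * stock->nr_slab_reclaimable_b; the shared vmstat counters are touched
 * via mod_objcg_mlstate() only once the accumulated value exceeds
 * PAGE_SIZE, or earlier when the cached objcg or pgdat changes and the
 * stock gets flushed.
 */
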
3228static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3229{
3230 struct memcg_stock_pcp *stock;
3231 unsigned long flags;
3232 bool ret = false;
3233
3234 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3235
3236 stock = this_cpu_ptr(&memcg_stock);
3237 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
3238 stock->nr_bytes -= nr_bytes;
3239 ret = true;
3240 }
3241
3242 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3243
3244 return ret;
3245}
3246
3247static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
3248{
3249 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
3250
3251 if (!old)
3252 return NULL;
3253
3254 if (stock->nr_bytes) {
3255 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3256 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3257
3258 if (nr_pages) {
3259 struct mem_cgroup *memcg;
3260
3261 memcg = get_mem_cgroup_from_objcg(old);
3262
3263 memcg_account_kmem(memcg, -nr_pages);
3264 __refill_stock(memcg, nr_pages);
3265
3266 css_put(&memcg->css);
3267 }
3268
3269 /*
3270 * The leftover is flushed to the centralized per-memcg value.
3271 * On the next attempt to refill obj stock it will be moved
3272	 * to a per-cpu stock (probably on another CPU), see
3273 * refill_obj_stock().
3274 *
3275 * How often it's flushed is a trade-off between the memory
3276 * limit enforcement accuracy and potential CPU contention,
3277 * so it might be changed in the future.
3278 */
3279 atomic_add(nr_bytes, &old->nr_charged_bytes);
3280 stock->nr_bytes = 0;
3281 }
3282
3283 /*
3284 * Flush the vmstat data in current stock
3285 */
3286 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3287 if (stock->nr_slab_reclaimable_b) {
3288 mod_objcg_mlstate(old, stock->cached_pgdat,
3289 NR_SLAB_RECLAIMABLE_B,
3290 stock->nr_slab_reclaimable_b);
3291 stock->nr_slab_reclaimable_b = 0;
3292 }
3293 if (stock->nr_slab_unreclaimable_b) {
3294 mod_objcg_mlstate(old, stock->cached_pgdat,
3295 NR_SLAB_UNRECLAIMABLE_B,
3296 stock->nr_slab_unreclaimable_b);
3297 stock->nr_slab_unreclaimable_b = 0;
3298 }
3299 stock->cached_pgdat = NULL;
3300 }
3301
3302 WRITE_ONCE(stock->cached_objcg, NULL);
3303 /*
3304	 * The `old' objcg needs to be released by the caller via
3305 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
3306 */
3307 return old;
3308}
3309
3310static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3311 struct mem_cgroup *root_memcg)
3312{
3313 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
3314 struct mem_cgroup *memcg;
3315
3316 if (objcg) {
3317 memcg = obj_cgroup_memcg(objcg);
3318 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3319 return true;
3320 }
3321
3322 return false;
3323}
3324
3325static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3326 bool allow_uncharge)
3327{
3328 struct memcg_stock_pcp *stock;
3329 struct obj_cgroup *old = NULL;
3330 unsigned long flags;
3331 unsigned int nr_pages = 0;
3332
3333 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3334
3335 stock = this_cpu_ptr(&memcg_stock);
3336 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
3337 old = drain_obj_stock(stock);
3338 obj_cgroup_get(objcg);
3339 WRITE_ONCE(stock->cached_objcg, objcg);
3340 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3341 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3342 allow_uncharge = true; /* Allow uncharge when objcg changes */
3343 }
3344 stock->nr_bytes += nr_bytes;
3345
3346 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3347 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3348 stock->nr_bytes &= (PAGE_SIZE - 1);
3349 }
3350
3351 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3352 if (old)
3353 obj_cgroup_put(old);
3354
3355 if (nr_pages)
3356 obj_cgroup_uncharge_pages(objcg, nr_pages);
3357}
3358
3359int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3360{
3361 unsigned int nr_pages, nr_bytes;
3362 int ret;
3363
3364 if (consume_obj_stock(objcg, size))
3365 return 0;
3366
3367 /*
3368 * In theory, objcg->nr_charged_bytes can have enough
3369 * pre-charged bytes to satisfy the allocation. However,
3370 * flushing objcg->nr_charged_bytes requires two atomic
3371 * operations, and objcg->nr_charged_bytes can't be big.
3372 * The shared objcg->nr_charged_bytes can also become a
3373 * performance bottleneck if all tasks of the same memcg are
3374	 * trying to update it. So it's better to ignore it and try to
3375	 * grab some new pages. The stock's nr_bytes will be flushed to
3376 * objcg->nr_charged_bytes later on when objcg changes.
3377 *
3378 * The stock's nr_bytes may contain enough pre-charged bytes
3379	 * to spare one page from being charged, but we can't rely
3380 * on the pre-charged bytes not being changed outside of
3381 * consume_obj_stock() or refill_obj_stock(). So ignore those
3382 * pre-charged bytes as well when charging pages. To avoid a
3383 * page uncharge right after a page charge, we set the
3384 * allow_uncharge flag to false when calling refill_obj_stock()
3385 * to temporarily allow the pre-charged bytes to exceed the page
3386 * size limit. The maximum reachable value of the pre-charged
3387 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
3388 * race.
3389 */
3390 nr_pages = size >> PAGE_SHIFT;
3391 nr_bytes = size & (PAGE_SIZE - 1);
3392
3393 if (nr_bytes)
3394 nr_pages += 1;
3395
3396 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3397 if (!ret && nr_bytes)
3398 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3399
3400 return ret;
3401}
3402
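/*
 * Worked example for the split above (size chosen for illustration):
 * charging size == 2 * PAGE_SIZE + 200 bytes rounds up to nr_pages == 3
 * whole pages via obj_cgroup_charge_pages(), and the unused remainder
 * of PAGE_SIZE - 200 bytes is handed back to the per-cpu stock through
 * refill_obj_stock() with allow_uncharge == false, so no page gets
 * uncharged right after being charged.
 */
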
3403void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3404{
3405 refill_obj_stock(objcg, size, true);
3406}
3407
3408#endif /* CONFIG_MEMCG_KMEM */
3409
3410/*
3411 * Because page_memcg(head) is not set on tails, set it now.
3412 */
3413void split_page_memcg(struct page *head, unsigned int nr)
3414{
3415 struct folio *folio = page_folio(head);
3416 struct mem_cgroup *memcg = folio_memcg(folio);
3417 int i;
3418
3419 if (mem_cgroup_disabled() || !memcg)
3420 return;
3421
3422 for (i = 1; i < nr; i++)
3423 folio_page(folio, i)->memcg_data = folio->memcg_data;
3424
3425 if (folio_memcg_kmem(folio))
3426 obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
3427 else
3428 css_get_many(&memcg->css, nr - 1);
3429}
3430
3431#ifdef CONFIG_SWAP
3432/**
3433 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3434 * @entry: swap entry to be moved
3435 * @from: mem_cgroup which the entry is moved from
3436 * @to: mem_cgroup which the entry is moved to
3437 *
3438 * It succeeds only when the swap_cgroup's record for this entry is the same
3439 * as the mem_cgroup's id of @from.
3440 *
3441 * Returns 0 on success, -EINVAL on failure.
3442 *
3443 * The caller must have charged to @to, IOW, called page_counter_charge() for
3444 * both res and memsw, and called css_get().
3445 */
3446static int mem_cgroup_move_swap_account(swp_entry_t entry,
3447 struct mem_cgroup *from, struct mem_cgroup *to)
3448{
3449 unsigned short old_id, new_id;
3450
3451 old_id = mem_cgroup_id(from);
3452 new_id = mem_cgroup_id(to);
3453
3454 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3455 mod_memcg_state(from, MEMCG_SWAP, -1);
3456 mod_memcg_state(to, MEMCG_SWAP, 1);
3457 return 0;
3458 }
3459 return -EINVAL;
3460}
3461#else
3462static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3463 struct mem_cgroup *from, struct mem_cgroup *to)
3464{
3465 return -EINVAL;
3466}
3467#endif
3468
3469static DEFINE_MUTEX(memcg_max_mutex);
3470
3471static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3472 unsigned long max, bool memsw)
3473{
3474 bool enlarge = false;
3475 bool drained = false;
3476 int ret;
3477 bool limits_invariant;
3478 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3479
3480 do {
3481 if (signal_pending(current)) {
3482 ret = -EINTR;
3483 break;
3484 }
3485
3486 mutex_lock(&memcg_max_mutex);
3487 /*
3488 * Make sure that the new limit (memsw or memory limit) doesn't
3489 * break our basic invariant rule memory.max <= memsw.max.
3490 */
3491 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3492 max <= memcg->memsw.max;
3493 if (!limits_invariant) {
3494 mutex_unlock(&memcg_max_mutex);
3495 ret = -EINVAL;
3496 break;
3497 }
3498 if (max > counter->max)
3499 enlarge = true;
3500 ret = page_counter_set_max(counter, max);
3501 mutex_unlock(&memcg_max_mutex);
3502
3503 if (!ret)
3504 break;
3505
3506 if (!drained) {
3507 drain_all_stock(memcg);
3508 drained = true;
3509 continue;
3510 }
3511
3512 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3513 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
3514 ret = -EBUSY;
3515 break;
3516 }
3517 } while (true);
3518
3519 if (!ret && enlarge)
3520 memcg_oom_recover(memcg);
3521
3522 return ret;
3523}
3524
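/*
 * Illustrative trigger (the cgroup1 mount point is an assumption):
 * the function above backs legacy limit writes such as
 *
 *	echo 512M > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes
 *
 * via mem_cgroup_write() further down. The new max is checked against
 * the memory.max <= memsw.max invariant, and reclaim is retried until
 * the lowered limit can be applied or -EBUSY is returned.
 */
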
3525unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3526 gfp_t gfp_mask,
3527 unsigned long *total_scanned)
3528{
3529 unsigned long nr_reclaimed = 0;
3530 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3531 unsigned long reclaimed;
3532 int loop = 0;
3533 struct mem_cgroup_tree_per_node *mctz;
3534 unsigned long excess;
3535
3536 if (lru_gen_enabled())
3537 return 0;
3538
3539 if (order > 0)
3540 return 0;
3541
3542 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3543
3544 /*
3545 * Do not even bother to check the largest node if the root
3546 * is empty. Do it lockless to prevent lock bouncing. Races
3547 * are acceptable as soft limit is best effort anyway.
3548 */
3549 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3550 return 0;
3551
3552 /*
3553	 * This loop can run for a while, especially if mem_cgroups continuously
3554	 * keep exceeding their soft limit and putting the system under
3555	 * pressure.
3556 */
3557 do {
3558 if (next_mz)
3559 mz = next_mz;
3560 else
3561 mz = mem_cgroup_largest_soft_limit_node(mctz);
3562 if (!mz)
3563 break;
3564
3565 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3566 gfp_mask, total_scanned);
3567 nr_reclaimed += reclaimed;
3568 spin_lock_irq(&mctz->lock);
3569
3570 /*
3571		 * If we failed to reclaim anything from this memory cgroup,
3572		 * it is time to move on to the next cgroup.
3573 */
3574 next_mz = NULL;
3575 if (!reclaimed)
3576 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3577
3578 excess = soft_limit_excess(mz->memcg);
3579 /*
3580 * One school of thought says that we should not add
3581 * back the node to the tree if reclaim returns 0.
3582		 * But our reclaim could return 0 simply because, due
3583		 * to priority, we are exposing a smaller subset of
3584 * memory to reclaim from. Consider this as a longer
3585 * term TODO.
3586 */
3587 /* If excess == 0, no tree ops */
3588 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3589 spin_unlock_irq(&mctz->lock);
3590 css_put(&mz->memcg->css);
3591 loop++;
3592 /*
3593 * Could not reclaim anything and there are no more
3594 * mem cgroups to try or we seem to be looping without
3595 * reclaiming anything.
3596 */
3597 if (!nr_reclaimed &&
3598 (next_mz == NULL ||
3599 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3600 break;
3601 } while (!nr_reclaimed);
3602 if (next_mz)
3603 css_put(&next_mz->memcg->css);
3604 return nr_reclaimed;
3605}
3606
3607/*
3608 * Reclaims as many pages from the given memcg as possible.
3609 *
3610 * Caller is responsible for holding css reference for memcg.
3611 */
3612static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3613{
3614 int nr_retries = MAX_RECLAIM_RETRIES;
3615
3616	/* we call try-to-free pages to make this cgroup empty */
3617 lru_add_drain_all();
3618
3619 drain_all_stock(memcg);
3620
3621 /* try to free all pages in this cgroup */
3622 while (nr_retries && page_counter_read(&memcg->memory)) {
3623 if (signal_pending(current))
3624 return -EINTR;
3625
3626 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3627 MEMCG_RECLAIM_MAY_SWAP))
3628 nr_retries--;
3629 }
3630
3631 return 0;
3632}
3633
3634static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3635 char *buf, size_t nbytes,
3636 loff_t off)
3637{
3638 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3639
3640 if (mem_cgroup_is_root(memcg))
3641 return -EINVAL;
3642 return mem_cgroup_force_empty(memcg) ?: nbytes;
3643}
3644
3645static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3646 struct cftype *cft)
3647{
3648 return 1;
3649}
3650
3651static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3652 struct cftype *cft, u64 val)
3653{
3654 if (val == 1)
3655 return 0;
3656
3657 pr_warn_once("Non-hierarchical mode is deprecated. "
3658 "Please report your usecase to linux-mm@kvack.org if you "
3659 "depend on this functionality.\n");
3660
3661 return -EINVAL;
3662}
3663
3664static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3665{
3666 unsigned long val;
3667
3668 if (mem_cgroup_is_root(memcg)) {
3669 /*
3670 * Approximate root's usage from global state. This isn't
3671 * perfect, but the root usage was always an approximation.
3672 */
3673 val = global_node_page_state(NR_FILE_PAGES) +
3674 global_node_page_state(NR_ANON_MAPPED);
3675 if (swap)
3676 val += total_swap_pages - get_nr_swap_pages();
3677 } else {
3678 if (!swap)
3679 val = page_counter_read(&memcg->memory);
3680 else
3681 val = page_counter_read(&memcg->memsw);
3682 }
3683 return val;
3684}
3685
3686enum {
3687 RES_USAGE,
3688 RES_LIMIT,
3689 RES_MAX_USAGE,
3690 RES_FAILCNT,
3691 RES_SOFT_LIMIT,
3692};
3693
3694static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3695 struct cftype *cft)
3696{
3697 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3698 struct page_counter *counter;
3699
3700 switch (MEMFILE_TYPE(cft->private)) {
3701 case _MEM:
3702 counter = &memcg->memory;
3703 break;
3704 case _MEMSWAP:
3705 counter = &memcg->memsw;
3706 break;
3707 case _KMEM:
3708 counter = &memcg->kmem;
3709 break;
3710 case _TCP:
3711 counter = &memcg->tcpmem;
3712 break;
3713 default:
3714 BUG();
3715 }
3716
3717 switch (MEMFILE_ATTR(cft->private)) {
3718 case RES_USAGE:
3719 if (counter == &memcg->memory)
3720 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3721 if (counter == &memcg->memsw)
3722 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3723 return (u64)page_counter_read(counter) * PAGE_SIZE;
3724 case RES_LIMIT:
3725 return (u64)counter->max * PAGE_SIZE;
3726 case RES_MAX_USAGE:
3727 return (u64)counter->watermark * PAGE_SIZE;
3728 case RES_FAILCNT:
3729 return counter->failcnt;
3730 case RES_SOFT_LIMIT:
3731 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
3732 default:
3733 BUG();
3734 }
3735}
3736
3737/*
3738 * This function doesn't do anything useful. Its only job is to provide a read
3739 * handler for a file so that cgroup_file_mode() will add read permissions.
3740 */
3741static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
3742 __always_unused void *v)
3743{
3744 return -EINVAL;
3745}
3746
3747#ifdef CONFIG_MEMCG_KMEM
3748static int memcg_online_kmem(struct mem_cgroup *memcg)
3749{
3750 struct obj_cgroup *objcg;
3751
3752 if (mem_cgroup_kmem_disabled())
3753 return 0;
3754
3755 if (unlikely(mem_cgroup_is_root(memcg)))
3756 return 0;
3757
3758 objcg = obj_cgroup_alloc();
3759 if (!objcg)
3760 return -ENOMEM;
3761
3762 objcg->memcg = memcg;
3763 rcu_assign_pointer(memcg->objcg, objcg);
3764
3765 static_branch_enable(&memcg_kmem_online_key);
3766
3767 memcg->kmemcg_id = memcg->id.id;
3768
3769 return 0;
3770}
3771
3772static void memcg_offline_kmem(struct mem_cgroup *memcg)
3773{
3774 struct mem_cgroup *parent;
3775
3776 if (mem_cgroup_kmem_disabled())
3777 return;
3778
3779 if (unlikely(mem_cgroup_is_root(memcg)))
3780 return;
3781
3782 parent = parent_mem_cgroup(memcg);
3783 if (!parent)
3784 parent = root_mem_cgroup;
3785
3786 memcg_reparent_objcgs(memcg, parent);
3787
3788 /*
3789 * After we have finished memcg_reparent_objcgs(), all list_lrus
3790 * corresponding to this cgroup are guaranteed to remain empty.
3791 * The ordering is imposed by list_lru_node->lock taken by
3792 * memcg_reparent_list_lrus().
3793 */
3794 memcg_reparent_list_lrus(memcg, parent);
3795}
3796#else
3797static int memcg_online_kmem(struct mem_cgroup *memcg)
3798{
3799 return 0;
3800}
3801static void memcg_offline_kmem(struct mem_cgroup *memcg)
3802{
3803}
3804#endif /* CONFIG_MEMCG_KMEM */
3805
3806static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3807{
3808 int ret;
3809
3810 mutex_lock(&memcg_max_mutex);
3811
3812 ret = page_counter_set_max(&memcg->tcpmem, max);
3813 if (ret)
3814 goto out;
3815
3816 if (!memcg->tcpmem_active) {
3817 /*
3818 * The active flag needs to be written after the static_key
3819 * update. This is what guarantees that the socket activation
3820 * function is the last one to run. See mem_cgroup_sk_alloc()
3821 * for details, and note that we don't mark any socket as
3822 * belonging to this memcg until that flag is up.
3823 *
3824 * We need to do this, because static_keys will span multiple
3825 * sites, but we can't control their order. If we mark a socket
3826 * as accounted, but the accounting functions are not patched in
3827 * yet, we'll lose accounting.
3828 *
3829 * We never race with the readers in mem_cgroup_sk_alloc(),
3830 * because when this value change, the code to process it is not
3831 * patched in yet.
3832 */
3833 static_branch_inc(&memcg_sockets_enabled_key);
3834 memcg->tcpmem_active = true;
3835 }
3836out:
3837 mutex_unlock(&memcg_max_mutex);
3838 return ret;
3839}
3840
3841/*
3842 * This write handler is used for the RES_LIMIT and RES_SOFT_LIMIT
3843 * control files.
3844 */
3845static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3846 char *buf, size_t nbytes, loff_t off)
3847{
3848 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3849 unsigned long nr_pages;
3850 int ret;
3851
3852 buf = strstrip(buf);
3853 ret = page_counter_memparse(buf, "-1", &nr_pages);
3854 if (ret)
3855 return ret;
3856
3857 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3858 case RES_LIMIT:
3859 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3860 ret = -EINVAL;
3861 break;
3862 }
3863 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3864 case _MEM:
3865 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3866 break;
3867 case _MEMSWAP:
3868 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3869 break;
3870 case _TCP:
3871 ret = memcg_update_tcp_max(memcg, nr_pages);
3872 break;
3873 }
3874 break;
3875 case RES_SOFT_LIMIT:
3876 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
3877 ret = -EOPNOTSUPP;
3878 } else {
3879 WRITE_ONCE(memcg->soft_limit, nr_pages);
3880 ret = 0;
3881 }
3882 break;
3883 }
3884 return ret ?: nbytes;
3885}
3886
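/*
 * Reset handler for the *.max_usage_in_bytes and *.failcnt files: writing
 * any value (conventionally 0) clears the watermark or the failure count;
 * the written value itself is ignored.
 */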
3887static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3888 size_t nbytes, loff_t off)
3889{
3890 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3891 struct page_counter *counter;
3892
3893 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3894 case _MEM:
3895 counter = &memcg->memory;
3896 break;
3897 case _MEMSWAP:
3898 counter = &memcg->memsw;
3899 break;
3900 case _KMEM:
3901 counter = &memcg->kmem;
3902 break;
3903 case _TCP:
3904 counter = &memcg->tcpmem;
3905 break;
3906 default:
3907 BUG();
3908 }
3909
3910 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3911 case RES_MAX_USAGE:
3912 page_counter_reset_watermark(counter);
3913 break;
3914 case RES_FAILCNT:
3915 counter->failcnt = 0;
3916 break;
3917 default:
3918 BUG();
3919 }
3920
3921 return nbytes;
3922}
3923
3924static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3925 struct cftype *cft)
3926{
3927 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3928}
3929
3930#ifdef CONFIG_MMU
3931static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3932 struct cftype *cft, u64 val)
3933{
3934 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3935
3936 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3937 "Please report your usecase to linux-mm@kvack.org if you "
3938 "depend on this functionality.\n");
3939
3940 if (val & ~MOVE_MASK)
3941 return -EINVAL;
3942
3943 /*
3944 * No locking is needed here, because ->can_attach() will
3945 * check this value once at the beginning of the migration, and then carry
3946 * on with stale data. This means that changes to this value will only
3947 * affect task migrations starting after the change.
3948 */
3949 memcg->move_charge_at_immigrate = val;
3950 return 0;
3951}
3952#else
3953static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3954 struct cftype *cft, u64 val)
3955{
3956 return -ENOSYS;
3957}
3958#endif
3959
3960#ifdef CONFIG_NUMA
3961
3962#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3963#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3964#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3965
3966static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3967 int nid, unsigned int lru_mask, bool tree)
3968{
3969 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3970 unsigned long nr = 0;
3971 enum lru_list lru;
3972
3973 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3974
3975 for_each_lru(lru) {
3976 if (!(BIT(lru) & lru_mask))
3977 continue;
3978 if (tree)
3979 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3980 else
3981 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3982 }
3983 return nr;
3984}
3985
3986static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3987 unsigned int lru_mask,
3988 bool tree)
3989{
3990 unsigned long nr = 0;
3991 enum lru_list lru;
3992
3993 for_each_lru(lru) {
3994 if (!(BIT(lru) & lru_mask))
3995 continue;
3996 if (tree)
3997 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3998 else
3999 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4000 }
4001 return nr;
4002}
4003
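/*
 * memory.numa_stat: one line per entry in stats[] below, first the local
 * (non-hierarchical) page counts and then the same data prefixed with
 * "hierarchical_", each as a cgroup-wide total followed by per-node
 * breakdowns, e.g.:
 *
 *   total=<pages> N0=<pages> N1=<pages> ...
 */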
4004static int memcg_numa_stat_show(struct seq_file *m, void *v)
4005{
4006 struct numa_stat {
4007 const char *name;
4008 unsigned int lru_mask;
4009 };
4010
4011 static const struct numa_stat stats[] = {
4012 { "total", LRU_ALL },
4013 { "file", LRU_ALL_FILE },
4014 { "anon", LRU_ALL_ANON },
4015 { "unevictable", BIT(LRU_UNEVICTABLE) },
4016 };
4017 const struct numa_stat *stat;
4018 int nid;
4019 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4020
4021 mem_cgroup_flush_stats();
4022
4023 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4024 seq_printf(m, "%s=%lu", stat->name,
4025 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4026 false));
4027 for_each_node_state(nid, N_MEMORY)
4028 seq_printf(m, " N%d=%lu", nid,
4029 mem_cgroup_node_nr_lru_pages(memcg, nid,
4030 stat->lru_mask, false));
4031 seq_putc(m, '\n');
4032 }
4033
4034 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4036 seq_printf(m, "hierarchical_%s=%lu", stat->name,
4037 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4038 true));
4039 for_each_node_state(nid, N_MEMORY)
4040 seq_printf(m, " N%d=%lu", nid,
4041 mem_cgroup_node_nr_lru_pages(memcg, nid,
4042 stat->lru_mask, true));
4043 seq_putc(m, '\n');
4044 }
4045
4046 return 0;
4047}
4048#endif /* CONFIG_NUMA */
4049
4050static const unsigned int memcg1_stats[] = {
4051 NR_FILE_PAGES,
4052 NR_ANON_MAPPED,
4053#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4054 NR_ANON_THPS,
4055#endif
4056 NR_SHMEM,
4057 NR_FILE_MAPPED,
4058 NR_FILE_DIRTY,
4059 NR_WRITEBACK,
4060 WORKINGSET_REFAULT_ANON,
4061 WORKINGSET_REFAULT_FILE,
4062 MEMCG_SWAP,
4063};
4064
4065static const char *const memcg1_stat_names[] = {
4066 "cache",
4067 "rss",
4068#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4069 "rss_huge",
4070#endif
4071 "shmem",
4072 "mapped_file",
4073 "dirty",
4074 "writeback",
4075 "workingset_refault_anon",
4076 "workingset_refault_file",
4077 "swap",
4078};
4079
4080/* Universal VM events cgroup1 shows, original sort order */
4081static const unsigned int memcg1_events[] = {
4082 PGPGIN,
4083 PGPGOUT,
4084 PGFAULT,
4085 PGMAJFAULT,
4086};
4087
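/*
 * Body of the cgroup v1 memory.stat file: local counters first ("cache",
 * "rss", ...), then the VM events and per-LRU sizes in bytes, followed by
 * the effective hierarchical limits and the same counters aggregated over
 * the subtree under a "total_" prefix.
 */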
4088static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
4089{
4090 unsigned long memory, memsw;
4091 struct mem_cgroup *mi;
4092 unsigned int i;
4093
4094 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4095
4096 mem_cgroup_flush_stats();
4097
4098 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4099 unsigned long nr;
4100
4101 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4102 continue;
4103 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4104 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
4105 nr * memcg_page_state_unit(memcg1_stats[i]));
4106 }
4107
4108 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4109 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
4110 memcg_events_local(memcg, memcg1_events[i]));
4111
4112 for (i = 0; i < NR_LRU_LISTS; i++)
4113 seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
4114 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4115 PAGE_SIZE);
4116
4117 /* Hierarchical information */
4118 memory = memsw = PAGE_COUNTER_MAX;
4119 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4120 memory = min(memory, READ_ONCE(mi->memory.max));
4121 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4122 }
4123 seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
4124 (u64)memory * PAGE_SIZE);
4125 if (do_memsw_account())
4126 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
4127 (u64)memsw * PAGE_SIZE);
4128
4129 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4130 unsigned long nr;
4131
4132 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4133 continue;
4134 nr = memcg_page_state(memcg, memcg1_stats[i]);
4135 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
4136 (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
4137 }
4138
4139 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4140 seq_buf_printf(s, "total_%s %llu\n",
4141 vm_event_name(memcg1_events[i]),
4142 (u64)memcg_events(memcg, memcg1_events[i]));
4143
4144 for (i = 0; i < NR_LRU_LISTS; i++)
4145 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
4146 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4147 PAGE_SIZE);
4148
4149#ifdef CONFIG_DEBUG_VM
4150 {
4151 pg_data_t *pgdat;
4152 struct mem_cgroup_per_node *mz;
4153 unsigned long anon_cost = 0;
4154 unsigned long file_cost = 0;
4155
4156 for_each_online_pgdat(pgdat) {
4157 mz = memcg->nodeinfo[pgdat->node_id];
4158
4159 anon_cost += mz->lruvec.anon_cost;
4160 file_cost += mz->lruvec.file_cost;
4161 }
4162 seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
4163 seq_buf_printf(s, "file_cost %lu\n", file_cost);
4164 }
4165#endif
4166}
4167
4168static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4169 struct cftype *cft)
4170{
4171 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4172
4173 return mem_cgroup_swappiness(memcg);
4174}
4175
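/*
 * memory.swappiness: same semantics and 0-200 range as the global
 * vm.swappiness knob; writing to the root cgroup falls through to the
 * global value, all other cgroups keep a private copy.
 */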
4176static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4177 struct cftype *cft, u64 val)
4178{
4179 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4180
4181 if (val > 200)
4182 return -EINVAL;
4183
4184 if (!mem_cgroup_is_root(memcg))
4185 WRITE_ONCE(memcg->swappiness, val);
4186 else
4187 WRITE_ONCE(vm_swappiness, val);
4188
4189 return 0;
4190}
4191
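/*
 * Threshold notifications (cgroup v1 eventfd interface). Thresholds are
 * kept in an array sorted by value, and current_threshold caches the last
 * entry at or below usage. Each call walks from that point in both
 * directions and signals every eventfd whose threshold has been crossed
 * since the previous call. For example, with thresholds {4M, 8M, 16M} and
 * usage going from 6M to 20M, the 8M and 16M eventfds fire and
 * current_threshold ends up at the 16M entry.
 */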
4192static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4193{
4194 struct mem_cgroup_threshold_ary *t;
4195 unsigned long usage;
4196 int i;
4197
4198 rcu_read_lock();
4199 if (!swap)
4200 t = rcu_dereference(memcg->thresholds.primary);
4201 else
4202 t = rcu_dereference(memcg->memsw_thresholds.primary);
4203
4204 if (!t)
4205 goto unlock;
4206
4207 usage = mem_cgroup_usage(memcg, swap);
4208
4209 /*
4210 * current_threshold points to the threshold just below or equal to usage.
4211 * If that is no longer true, a threshold was crossed after the last
4212 * call of __mem_cgroup_threshold().
4213 */
4214 i = t->current_threshold;
4215
4216 /*
4217 * Iterate backward over array of thresholds starting from
4218 * current_threshold and check if a threshold is crossed.
4219 * If none of the thresholds below usage is crossed, we read
4220 * only one element of the array here.
4221 */
4222 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4223 eventfd_signal(t->entries[i].eventfd, 1);
4224
4225 /* i = current_threshold + 1 */
4226 i++;
4227
4228 /*
4229 * Iterate forward over array of thresholds starting from
4230 * current_threshold+1 and check if a threshold is crossed.
4231 * If none of the thresholds above usage is crossed, we read
4232 * only one element of the array here.
4233 */
4234 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4235 eventfd_signal(t->entries[i].eventfd, 1);
4236
4237 /* Update current_threshold */
4238 t->current_threshold = i - 1;
4239unlock:
4240 rcu_read_unlock();
4241}
4242
4243static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4244{
4245 while (memcg) {
4246 __mem_cgroup_threshold(memcg, false);
4247 if (do_memsw_account())
4248 __mem_cgroup_threshold(memcg, true);
4249
4250 memcg = parent_mem_cgroup(memcg);
4251 }
4252}
4253
4254static int compare_thresholds(const void *a, const void *b)
4255{
4256 const struct mem_cgroup_threshold *_a = a;
4257 const struct mem_cgroup_threshold *_b = b;
4258
4259 if (_a->threshold > _b->threshold)
4260 return 1;
4261
4262 if (_a->threshold < _b->threshold)
4263 return -1;
4264
4265 return 0;
4266}
4267
4268static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4269{
4270 struct mem_cgroup_eventfd_list *ev;
4271
4272 spin_lock(&memcg_oom_lock);
4273
4274 list_for_each_entry(ev, &memcg->oom_notify, list)
4275 eventfd_signal(ev->eventfd, 1);
4276
4277 spin_unlock(&memcg_oom_lock);
4278 return 0;
4279}
4280
4281static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4282{
4283 struct mem_cgroup *iter;
4284
4285 for_each_mem_cgroup_tree(iter, memcg)
4286 mem_cgroup_oom_notify_cb(iter);
4287}
4288
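/*
 * Registration uses copy-on-write: a new array with room for one more
 * entry is allocated, sorted and published with rcu_assign_pointer(); the
 * previous primary array is kept as ->spare so the matching unregister
 * path below can reuse it. Readers in __mem_cgroup_threshold() therefore
 * always see a consistent, sorted array.
 */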
4289static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4290 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4291{
4292 struct mem_cgroup_thresholds *thresholds;
4293 struct mem_cgroup_threshold_ary *new;
4294 unsigned long threshold;
4295 unsigned long usage;
4296 int i, size, ret;
4297
4298 ret = page_counter_memparse(args, "-1", &threshold);
4299 if (ret)
4300 return ret;
4301
4302 mutex_lock(&memcg->thresholds_lock);
4303
4304 if (type == _MEM) {
4305 thresholds = &memcg->thresholds;
4306 usage = mem_cgroup_usage(memcg, false);
4307 } else if (type == _MEMSWAP) {
4308 thresholds = &memcg->memsw_thresholds;
4309 usage = mem_cgroup_usage(memcg, true);
4310 } else
4311 BUG();
4312
4313 /* Check if a threshold crossed before adding a new one */
4314 if (thresholds->primary)
4315 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4316
4317 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4318
4319 /* Allocate memory for new array of thresholds */
4320 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4321 if (!new) {
4322 ret = -ENOMEM;
4323 goto unlock;
4324 }
4325 new->size = size;
4326
4327 /* Copy thresholds (if any) to new array */
4328 if (thresholds->primary)
4329 memcpy(new->entries, thresholds->primary->entries,
4330 flex_array_size(new, entries, size - 1));
4331
4332 /* Add new threshold */
4333 new->entries[size - 1].eventfd = eventfd;
4334 new->entries[size - 1].threshold = threshold;
4335
4336 /* Sort thresholds. Registering of new threshold isn't time-critical */
4337 sort(new->entries, size, sizeof(*new->entries),
4338 compare_thresholds, NULL);
4339
4340 /* Find current threshold */
4341 new->current_threshold = -1;
4342 for (i = 0; i < size; i++) {
4343 if (new->entries[i].threshold <= usage) {
4344 /*
4345 * new->current_threshold will not be used until
4346 * rcu_assign_pointer(), so it's safe to increment
4347 * it here.
4348 */
4349 ++new->current_threshold;
4350 } else
4351 break;
4352 }
4353
4354 /* Free old spare buffer and save old primary buffer as spare */
4355 kfree(thresholds->spare);
4356 thresholds->spare = thresholds->primary;
4357
4358 rcu_assign_pointer(thresholds->primary, new);
4359
4360 /* To be sure that nobody uses thresholds */
4361 synchronize_rcu();
4362
4363unlock:
4364 mutex_unlock(&memcg->thresholds_lock);
4365
4366 return ret;
4367}
4368
4369static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4370 struct eventfd_ctx *eventfd, const char *args)
4371{
4372 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4373}
4374
4375static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4376 struct eventfd_ctx *eventfd, const char *args)
4377{
4378 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4379}
4380
4381static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4382 struct eventfd_ctx *eventfd, enum res_type type)
4383{
4384 struct mem_cgroup_thresholds *thresholds;
4385 struct mem_cgroup_threshold_ary *new;
4386 unsigned long usage;
4387 int i, j, size, entries;
4388
4389 mutex_lock(&memcg->thresholds_lock);
4390
4391 if (type == _MEM) {
4392 thresholds = &memcg->thresholds;
4393 usage = mem_cgroup_usage(memcg, false);
4394 } else if (type == _MEMSWAP) {
4395 thresholds = &memcg->memsw_thresholds;
4396 usage = mem_cgroup_usage(memcg, true);
4397 } else
4398 BUG();
4399
4400 if (!thresholds->primary)
4401 goto unlock;
4402
4403 /* Check if a threshold crossed before removing */
4404 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4405
4406 /* Calculate the new number of thresholds */
4407 size = entries = 0;
4408 for (i = 0; i < thresholds->primary->size; i++) {
4409 if (thresholds->primary->entries[i].eventfd != eventfd)
4410 size++;
4411 else
4412 entries++;
4413 }
4414
4415 new = thresholds->spare;
4416
4417 /* If no items related to eventfd have been cleared, nothing to do */
4418 if (!entries)
4419 goto unlock;
4420
4421 /* Set thresholds array to NULL if we don't have thresholds */
4422 if (!size) {
4423 kfree(new);
4424 new = NULL;
4425 goto swap_buffers;
4426 }
4427
4428 new->size = size;
4429
4430 /* Copy thresholds and find current threshold */
4431 new->current_threshold = -1;
4432 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4433 if (thresholds->primary->entries[i].eventfd == eventfd)
4434 continue;
4435
4436 new->entries[j] = thresholds->primary->entries[i];
4437 if (new->entries[j].threshold <= usage) {
4438 /*
4439 * new->current_threshold will not be used
4440 * until rcu_assign_pointer(), so it's safe to increment
4441 * it here.
4442 */
4443 ++new->current_threshold;
4444 }
4445 j++;
4446 }
4447
4448swap_buffers:
4449 /* Swap primary and spare array */
4450 thresholds->spare = thresholds->primary;
4451
4452 rcu_assign_pointer(thresholds->primary, new);
4453
4454 /* To be sure that nobody uses thresholds */
4455 synchronize_rcu();
4456
4457 /* If all events are unregistered, free the spare array */
4458 if (!new) {
4459 kfree(thresholds->spare);
4460 thresholds->spare = NULL;
4461 }
4462unlock:
4463 mutex_unlock(&memcg->thresholds_lock);
4464}
4465
4466static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4467 struct eventfd_ctx *eventfd)
4468{
4469 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4470}
4471
4472static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4473 struct eventfd_ctx *eventfd)
4474{
4475 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4476}
4477
4478static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4479 struct eventfd_ctx *eventfd, const char *args)
4480{
4481 struct mem_cgroup_eventfd_list *event;
4482
4483 event = kmalloc(sizeof(*event), GFP_KERNEL);
4484 if (!event)
4485 return -ENOMEM;
4486
4487 spin_lock(&memcg_oom_lock);
4488
4489 event->eventfd = eventfd;
4490 list_add(&event->list, &memcg->oom_notify);
4491
4492 /* already in OOM ? */
4493 if (memcg->under_oom)
4494 eventfd_signal(eventfd, 1);
4495 spin_unlock(&memcg_oom_lock);
4496
4497 return 0;
4498}
4499
4500static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4501 struct eventfd_ctx *eventfd)
4502{
4503 struct mem_cgroup_eventfd_list *ev, *tmp;
4504
4505 spin_lock(&memcg_oom_lock);
4506
4507 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4508 if (ev->eventfd == eventfd) {
4509 list_del(&ev->list);
4510 kfree(ev);
4511 }
4512 }
4513
4514 spin_unlock(&memcg_oom_lock);
4515}
4516
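/*
 * memory.oom_control (cgroup v1): reads report oom_kill_disable, under_oom
 * and the cumulative oom_kill count; writing 1 disables the OOM killer for
 * this cgroup, so tasks that hit the limit wait for userspace to act
 * instead of being killed, and writing 0 re-enables it and wakes anyone
 * blocked in the OOM waitqueue via memcg_oom_recover().
 */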
4517static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4518{
4519 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4520
4521 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
4522 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4523 seq_printf(sf, "oom_kill %lu\n",
4524 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4525 return 0;
4526}
4527
4528static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4529 struct cftype *cft, u64 val)
4530{
4531 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4532
4533 /* cannot set to root cgroup and only 0 and 1 are allowed */
4534 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4535 return -EINVAL;
4536
4537 WRITE_ONCE(memcg->oom_kill_disable, val);
4538 if (!val)
4539 memcg_oom_recover(memcg);
4540
4541 return 0;
4542}
4543
4544#ifdef CONFIG_CGROUP_WRITEBACK
4545
4546#include <trace/events/writeback.h>
4547
4548static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4549{
4550 return wb_domain_init(&memcg->cgwb_domain, gfp);
4551}
4552
4553static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4554{
4555 wb_domain_exit(&memcg->cgwb_domain);
4556}
4557
4558static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4559{
4560 wb_domain_size_changed(&memcg->cgwb_domain);
4561}
4562
4563struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4564{
4565 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4566
4567 if (!memcg->css.parent)
4568 return NULL;
4569
4570 return &memcg->cgwb_domain;
4571}
4572
4573/**
4574 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4575 * @wb: bdi_writeback in question
4576 * @pfilepages: out parameter for number of file pages
4577 * @pheadroom: out parameter for number of allocatable pages according to memcg
4578 * @pdirty: out parameter for number of dirty pages
4579 * @pwriteback: out parameter for number of pages under writeback
4580 *
4581 * Determine the numbers of file, headroom, dirty, and writeback pages in
4582 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
4583 * is a bit more involved.
4584 *
4585 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
4586 * headroom is calculated as the lowest headroom of itself and the
4587 * ancestors. Note that this doesn't consider the actual amount of
4588 * available memory in the system. The caller should further cap
4589 * *@pheadroom accordingly.
4590 */
4591void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4592 unsigned long *pheadroom, unsigned long *pdirty,
4593 unsigned long *pwriteback)
4594{
4595 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4596 struct mem_cgroup *parent;
4597
4598 mem_cgroup_flush_stats();
4599
4600 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4601 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4602 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4603 memcg_page_state(memcg, NR_ACTIVE_FILE);
4604
4605 *pheadroom = PAGE_COUNTER_MAX;
4606 while ((parent = parent_mem_cgroup(memcg))) {
4607 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4608 READ_ONCE(memcg->memory.high));
4609 unsigned long used = page_counter_read(&memcg->memory);
4610
4611 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4612 memcg = parent;
4613 }
4614}
4615
4616/*
4617 * Foreign dirty flushing
4618 *
4619 * There's an inherent mismatch between memcg and writeback. The former
4620 * tracks ownership per-page while the latter per-inode. This was a
4621 * deliberate design decision because honoring per-page ownership in the
4622 * writeback path is complicated, may lead to higher CPU and IO overheads
4623 * and deemed unnecessary given that write-sharing an inode across
4624 * different cgroups isn't a common use-case.
4625 *
4626 * Combined with inode majority-writer ownership switching, this works well
4627 * enough in most cases but there are some pathological cases. For
4628 * example, let's say there are two cgroups A and B which keep writing to
4629 * different but confined parts of the same inode. B owns the inode and
4630 * A's memory is limited far below B's. A's dirty ratio can rise enough to
4631 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4632 * triggering background writeback. A will be slowed down without a way to
4633 * make writeback of the dirty pages happen.
4634 *
4635 * Conditions like the above can lead to a cgroup getting repeatedly and
4636 * severely throttled after making some progress after each
4637 * dirty_expire_interval while the underlying IO device is almost
4638 * completely idle.
4639 *
4640 * Solving this problem completely requires matching the ownership tracking
4641 * granularities between memcg and writeback in either direction. However,
4642 * the more egregious behaviors can be avoided by simply remembering the
4643 * most recent foreign dirtying events and initiating remote flushes on
4644 * them when local writeback isn't enough to keep the memory clean enough.
4645 *
4646 * The following two functions implement such mechanism. When a foreign
4647 * page - a page whose memcg and writeback ownerships don't match - is
4648 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4649 * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4650 * decides that the memcg needs to sleep due to high dirty ratio, it calls
4651 * mem_cgroup_flush_foreign() which queues writeback on the recorded
4652 * foreign bdi_writebacks which haven't expired. Both the numbers of
4653 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4654 * limited to MEMCG_CGWB_FRN_CNT.
4655 *
4656 * The mechanism only remembers IDs and doesn't hold any object references.
4657 * As being wrong occasionally doesn't matter, updates and accesses to the
4658 * records are lockless and racy.
4659 */
4660void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
4661 struct bdi_writeback *wb)
4662{
4663 struct mem_cgroup *memcg = folio_memcg(folio);
4664 struct memcg_cgwb_frn *frn;
4665 u64 now = get_jiffies_64();
4666 u64 oldest_at = now;
4667 int oldest = -1;
4668 int i;
4669
4670 trace_track_foreign_dirty(folio, wb);
4671
4672 /*
4673 * Pick the slot to use. If there is already a slot for @wb, keep
4674 * using it. If not, replace the oldest one which isn't being
4675 * written out.
4676 */
4677 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4678 frn = &memcg->cgwb_frn[i];
4679 if (frn->bdi_id == wb->bdi->id &&
4680 frn->memcg_id == wb->memcg_css->id)
4681 break;
4682 if (time_before64(frn->at, oldest_at) &&
4683 atomic_read(&frn->done.cnt) == 1) {
4684 oldest = i;
4685 oldest_at = frn->at;
4686 }
4687 }
4688
4689 if (i < MEMCG_CGWB_FRN_CNT) {
4690 /*
4691 * Re-using an existing one. Update timestamp lazily to
4692 * avoid making the cacheline hot. We want them to be
4693 * reasonably up-to-date and significantly shorter than
4694 * dirty_expire_interval as that's what expires the record.
4695 * Use the shorter of 1s and dirty_expire_interval / 8.
4696 */
4697 unsigned long update_intv =
4698 min_t(unsigned long, HZ,
4699 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4700
4701 if (time_before64(frn->at, now - update_intv))
4702 frn->at = now;
4703 } else if (oldest >= 0) {
4704 /* replace the oldest free one */
4705 frn = &memcg->cgwb_frn[oldest];
4706 frn->bdi_id = wb->bdi->id;
4707 frn->memcg_id = wb->memcg_css->id;
4708 frn->at = now;
4709 }
4710}
4711
4712/* issue foreign writeback flushes for recorded foreign dirtying events */
4713void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4714{
4715 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4716 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4717 u64 now = jiffies_64;
4718 int i;
4719
4720 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4721 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4722
4723 /*
4724 * If the record is older than dirty_expire_interval,
4725 * writeback on it has already started. No need to kick it
4726 * off again. Also, don't start a new one if there's
4727 * already one in flight.
4728 */
4729 if (time_after64(frn->at, now - intv) &&
4730 atomic_read(&frn->done.cnt) == 1) {
4731 frn->at = 0;
4732 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4733 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4734 WB_REASON_FOREIGN_FLUSH,
4735 &frn->done);
4736 }
4737 }
4738}
4739
4740#else /* CONFIG_CGROUP_WRITEBACK */
4741
4742static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4743{
4744 return 0;
4745}
4746
4747static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4748{
4749}
4750
4751static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4752{
4753}
4754
4755#endif /* CONFIG_CGROUP_WRITEBACK */
4756
4757/*
4758 * DO NOT USE IN NEW FILES.
4759 *
4760 * "cgroup.event_control" implementation.
4761 *
4762 * This is way over-engineered. It tries to support fully configurable
4763 * events for each user. Such level of flexibility is completely
4764 * unnecessary especially in the light of the planned unified hierarchy.
4765 *
4766 * Please deprecate this and replace with something simpler if at all
4767 * possible.
4768 */
4769
4770/*
4771 * Unregister event and free resources.
4772 *
4773 * Gets called from workqueue.
4774 */
4775static void memcg_event_remove(struct work_struct *work)
4776{
4777 struct mem_cgroup_event *event =
4778 container_of(work, struct mem_cgroup_event, remove);
4779 struct mem_cgroup *memcg = event->memcg;
4780
4781 remove_wait_queue(event->wqh, &event->wait);
4782
4783 event->unregister_event(memcg, event->eventfd);
4784
4785 /* Notify userspace the event is going away. */
4786 eventfd_signal(event->eventfd, 1);
4787
4788 eventfd_ctx_put(event->eventfd);
4789 kfree(event);
4790 css_put(&memcg->css);
4791}
4792
4793/*
4794 * Gets called on EPOLLHUP on eventfd when user closes it.
4795 *
4796 * Called with wqh->lock held and interrupts disabled.
4797 */
4798static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4799 int sync, void *key)
4800{
4801 struct mem_cgroup_event *event =
4802 container_of(wait, struct mem_cgroup_event, wait);
4803 struct mem_cgroup *memcg = event->memcg;
4804 __poll_t flags = key_to_poll(key);
4805
4806 if (flags & EPOLLHUP) {
4807 /*
4808 * If the event has been detached at cgroup removal, we
4809 * can simply return knowing the other side will cleanup
4810 * for us.
4811 *
4812 * We can't race against event freeing since the other
4813 * side will require wqh->lock via remove_wait_queue(),
4814 * which we hold.
4815 */
4816 spin_lock(&memcg->event_list_lock);
4817 if (!list_empty(&event->list)) {
4818 list_del_init(&event->list);
4819 /*
4820 * We are in atomic context, but cgroup_event_remove()
4821 * may sleep, so we have to call it in workqueue.
4822 */
4823 schedule_work(&event->remove);
4824 }
4825 spin_unlock(&memcg->event_list_lock);
4826 }
4827
4828 return 0;
4829}
4830
4831static void memcg_event_ptable_queue_proc(struct file *file,
4832 wait_queue_head_t *wqh, poll_table *pt)
4833{
4834 struct mem_cgroup_event *event =
4835 container_of(pt, struct mem_cgroup_event, pt);
4836
4837 event->wqh = wqh;
4838 add_wait_queue(wqh, &event->wait);
4839}
4840
4841/*
4842 * DO NOT USE IN NEW FILES.
4843 *
4844 * Parse input and register new cgroup event handler.
4845 *
4846 * Input must be in format '<event_fd> <control_fd> <args>'.
4847 * Interpretation of args is defined by control file implementation.
4848 */
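/*
 * Usage sketch, for illustration: userspace creates an eventfd, opens a
 * control file such as memory.usage_in_bytes, and writes
 * "<eventfd> <control_fd> 50M" into cgroup.event_control. The eventfd is
 * then signalled whenever the registered handler triggers (here, usage
 * crossing 50M) and one final time when the event is torn down, e.g. on
 * cgroup removal.
 */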
4849static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4850 char *buf, size_t nbytes, loff_t off)
4851{
4852 struct cgroup_subsys_state *css = of_css(of);
4853 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4854 struct mem_cgroup_event *event;
4855 struct cgroup_subsys_state *cfile_css;
4856 unsigned int efd, cfd;
4857 struct fd efile;
4858 struct fd cfile;
4859 struct dentry *cdentry;
4860 const char *name;
4861 char *endp;
4862 int ret;
4863
4864 if (IS_ENABLED(CONFIG_PREEMPT_RT))
4865 return -EOPNOTSUPP;
4866
4867 buf = strstrip(buf);
4868
4869 efd = simple_strtoul(buf, &endp, 10);
4870 if (*endp != ' ')
4871 return -EINVAL;
4872 buf = endp + 1;
4873
4874 cfd = simple_strtoul(buf, &endp, 10);
4875 if ((*endp != ' ') && (*endp != '\0'))
4876 return -EINVAL;
4877 buf = endp + 1;
4878
4879 event = kzalloc(sizeof(*event), GFP_KERNEL);
4880 if (!event)
4881 return -ENOMEM;
4882
4883 event->memcg = memcg;
4884 INIT_LIST_HEAD(&event->list);
4885 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4886 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4887 INIT_WORK(&event->remove, memcg_event_remove);
4888
4889 efile = fdget(efd);
4890 if (!efile.file) {
4891 ret = -EBADF;
4892 goto out_kfree;
4893 }
4894
4895 event->eventfd = eventfd_ctx_fileget(efile.file);
4896 if (IS_ERR(event->eventfd)) {
4897 ret = PTR_ERR(event->eventfd);
4898 goto out_put_efile;
4899 }
4900
4901 cfile = fdget(cfd);
4902 if (!cfile.file) {
4903 ret = -EBADF;
4904 goto out_put_eventfd;
4905 }
4906
4907 /* the process needs read permission on the control file */
4908 /* AV: shouldn't we check that it's been opened for read instead? */
4909 ret = file_permission(cfile.file, MAY_READ);
4910 if (ret < 0)
4911 goto out_put_cfile;
4912
4913 /*
4914 * The control file must be a regular cgroup1 file. As a regular cgroup
4915 * file can't be renamed, it's safe to access its name afterwards.
4916 */
4917 cdentry = cfile.file->f_path.dentry;
4918 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4919 ret = -EINVAL;
4920 goto out_put_cfile;
4921 }
4922
4923 /*
4924 * Determine the event callbacks and set them in @event. This used
4925 * to be done via struct cftype but cgroup core no longer knows
4926 * about these events. The following is crude but the whole thing
4927 * is for compatibility anyway.
4928 *
4929 * DO NOT ADD NEW FILES.
4930 */
4931 name = cdentry->d_name.name;
4932
4933 if (!strcmp(name, "memory.usage_in_bytes")) {
4934 event->register_event = mem_cgroup_usage_register_event;
4935 event->unregister_event = mem_cgroup_usage_unregister_event;
4936 } else if (!strcmp(name, "memory.oom_control")) {
4937 event->register_event = mem_cgroup_oom_register_event;
4938 event->unregister_event = mem_cgroup_oom_unregister_event;
4939 } else if (!strcmp(name, "memory.pressure_level")) {
4940 event->register_event = vmpressure_register_event;
4941 event->unregister_event = vmpressure_unregister_event;
4942 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4943 event->register_event = memsw_cgroup_usage_register_event;
4944 event->unregister_event = memsw_cgroup_usage_unregister_event;
4945 } else {
4946 ret = -EINVAL;
4947 goto out_put_cfile;
4948 }
4949
4950 /*
4951 * Verify @cfile should belong to @css. Also, remaining events are
4952 * automatically removed on cgroup destruction but the removal is
4953 * asynchronous, so take an extra ref on @css.
4954 */
4955 cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
4956 &memory_cgrp_subsys);
4957 ret = -EINVAL;
4958 if (IS_ERR(cfile_css))
4959 goto out_put_cfile;
4960 if (cfile_css != css) {
4961 css_put(cfile_css);
4962 goto out_put_cfile;
4963 }
4964
4965 ret = event->register_event(memcg, event->eventfd, buf);
4966 if (ret)
4967 goto out_put_css;
4968
4969 vfs_poll(efile.file, &event->pt);
4970
4971 spin_lock_irq(&memcg->event_list_lock);
4972 list_add(&event->list, &memcg->event_list);
4973 spin_unlock_irq(&memcg->event_list_lock);
4974
4975 fdput(cfile);
4976 fdput(efile);
4977
4978 return nbytes;
4979
4980out_put_css:
4981 css_put(css);
4982out_put_cfile:
4983 fdput(cfile);
4984out_put_eventfd:
4985 eventfd_ctx_put(event->eventfd);
4986out_put_efile:
4987 fdput(efile);
4988out_kfree:
4989 kfree(event);
4990
4991 return ret;
4992}
4993
4994#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4995static int mem_cgroup_slab_show(struct seq_file *m, void *p)
4996{
4997 /*
4998 * Deprecated.
4999 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
5000 */
5001 return 0;
5002}
5003#endif
5004
5005static int memory_stat_show(struct seq_file *m, void *v);
5006
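/*
 * cgroup v1 ("legacy") control files. Resource-counter files encode their
 * counter type and attribute in ->private via MEMFILE_PRIVATE(), which
 * lets mem_cgroup_read_u64(), mem_cgroup_write() and mem_cgroup_reset()
 * be shared across the memory, kmem and tcp counters.
 */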
5007static struct cftype mem_cgroup_legacy_files[] = {
5008 {
5009 .name = "usage_in_bytes",
5010 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5011 .read_u64 = mem_cgroup_read_u64,
5012 },
5013 {
5014 .name = "max_usage_in_bytes",
5015 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5016 .write = mem_cgroup_reset,
5017 .read_u64 = mem_cgroup_read_u64,
5018 },
5019 {
5020 .name = "limit_in_bytes",
5021 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5022 .write = mem_cgroup_write,
5023 .read_u64 = mem_cgroup_read_u64,
5024 },
5025 {
5026 .name = "soft_limit_in_bytes",
5027 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5028 .write = mem_cgroup_write,
5029 .read_u64 = mem_cgroup_read_u64,
5030 },
5031 {
5032 .name = "failcnt",
5033 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5034 .write = mem_cgroup_reset,
5035 .read_u64 = mem_cgroup_read_u64,
5036 },
5037 {
5038 .name = "stat",
5039 .seq_show = memory_stat_show,
5040 },
5041 {
5042 .name = "force_empty",
5043 .write = mem_cgroup_force_empty_write,
5044 },
5045 {
5046 .name = "use_hierarchy",
5047 .write_u64 = mem_cgroup_hierarchy_write,
5048 .read_u64 = mem_cgroup_hierarchy_read,
5049 },
5050 {
5051 .name = "cgroup.event_control", /* XXX: for compat */
5052 .write = memcg_write_event_control,
5053 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
5054 },
5055 {
5056 .name = "swappiness",
5057 .read_u64 = mem_cgroup_swappiness_read,
5058 .write_u64 = mem_cgroup_swappiness_write,
5059 },
5060 {
5061 .name = "move_charge_at_immigrate",
5062 .read_u64 = mem_cgroup_move_charge_read,
5063 .write_u64 = mem_cgroup_move_charge_write,
5064 },
5065 {
5066 .name = "oom_control",
5067 .seq_show = mem_cgroup_oom_control_read,
5068 .write_u64 = mem_cgroup_oom_control_write,
5069 },
5070 {
5071 .name = "pressure_level",
5072 .seq_show = mem_cgroup_dummy_seq_show,
5073 },
5074#ifdef CONFIG_NUMA
5075 {
5076 .name = "numa_stat",
5077 .seq_show = memcg_numa_stat_show,
5078 },
5079#endif
5080 {
5081 .name = "kmem.usage_in_bytes",
5082 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5083 .read_u64 = mem_cgroup_read_u64,
5084 },
5085 {
5086 .name = "kmem.failcnt",
5087 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5088 .write = mem_cgroup_reset,
5089 .read_u64 = mem_cgroup_read_u64,
5090 },
5091 {
5092 .name = "kmem.max_usage_in_bytes",
5093 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5094 .write = mem_cgroup_reset,
5095 .read_u64 = mem_cgroup_read_u64,
5096 },
5097#if defined(CONFIG_MEMCG_KMEM) && \
5098 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5099 {
5100 .name = "kmem.slabinfo",
5101 .seq_show = mem_cgroup_slab_show,
5102 },
5103#endif
5104 {
5105 .name = "kmem.tcp.limit_in_bytes",
5106 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5107 .write = mem_cgroup_write,
5108 .read_u64 = mem_cgroup_read_u64,
5109 },
5110 {
5111 .name = "kmem.tcp.usage_in_bytes",
5112 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5113 .read_u64 = mem_cgroup_read_u64,
5114 },
5115 {
5116 .name = "kmem.tcp.failcnt",
5117 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5118 .write = mem_cgroup_reset,
5119 .read_u64 = mem_cgroup_read_u64,
5120 },
5121 {
5122 .name = "kmem.tcp.max_usage_in_bytes",
5123 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5124 .write = mem_cgroup_reset,
5125 .read_u64 = mem_cgroup_read_u64,
5126 },
5127 { }, /* terminate */
5128};
5129
5130/*
5131 * Private memory cgroup IDR
5132 *
5133 * Swap-out records and page cache shadow entries need to store memcg
5134 * references in constrained space, so we maintain an ID space that is
5135 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
5136 * memory-controlled cgroups to 64k.
5137 *
5138 * However, there usually are many references to the offline CSS after
5139 * the cgroup has been destroyed, such as page cache or reclaimable
5140 * slab objects, that don't need to hang on to the ID. We want to keep
5141 * those dead CSS from occupying IDs, or we might quickly exhaust the
5142 * relatively small ID space and prevent the creation of new cgroups
5143 * even when there are much fewer than 64k cgroups - possibly none.
5144 *
5145 * Maintain a private 16-bit ID space for memcg, and allow the ID to
5146 * be freed and recycled when it's no longer needed, which is usually
5147 * when the CSS is offlined.
5148 *
5149 * The only exception to that are records of swapped out tmpfs/shmem
5150 * pages that need to be attributed to live ancestors on swapin. But
5151 * those references are manageable from userspace.
5152 */
5153
5154#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
5155static DEFINE_IDR(mem_cgroup_idr);
5156
5157static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5158{
5159 if (memcg->id.id > 0) {
5160 idr_remove(&mem_cgroup_idr, memcg->id.id);
5161 memcg->id.id = 0;
5162 }
5163}
5164
5165static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5166 unsigned int n)
5167{
5168 refcount_add(n, &memcg->id.ref);
5169}
5170
5171static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5172{
5173 if (refcount_sub_and_test(n, &memcg->id.ref)) {
5174 mem_cgroup_id_remove(memcg);
5175
5176 /* Memcg ID pins CSS */
5177 css_put(&memcg->css);
5178 }
5179}
5180
5181static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5182{
5183 mem_cgroup_id_put_many(memcg, 1);
5184}
5185
5186/**
5187 * mem_cgroup_from_id - look up a memcg from a memcg id
5188 * @id: the memcg id to look up
5189 *
5190 * Caller must hold rcu_read_lock().
5191 */
5192struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5193{
5194 WARN_ON_ONCE(!rcu_read_lock_held());
5195 return idr_find(&mem_cgroup_idr, id);
5196}
5197
5198#ifdef CONFIG_SHRINKER_DEBUG
5199struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
5200{
5201 struct cgroup *cgrp;
5202 struct cgroup_subsys_state *css;
5203 struct mem_cgroup *memcg;
5204
5205 cgrp = cgroup_get_from_id(ino);
5206 if (IS_ERR(cgrp))
5207 return ERR_CAST(cgrp);
5208
5209 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
5210 if (css)
5211 memcg = container_of(css, struct mem_cgroup, css);
5212 else
5213 memcg = ERR_PTR(-ENOENT);
5214
5215 cgroup_put(cgrp);
5216
5217 return memcg;
5218}
5219#endif
5220
5221static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5222{
5223 struct mem_cgroup_per_node *pn;
5224
5225 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
5226 if (!pn)
5227 return 1;
5228
5229 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5230 GFP_KERNEL_ACCOUNT);
5231 if (!pn->lruvec_stats_percpu) {
5232 kfree(pn);
5233 return 1;
5234 }
5235
5236 lruvec_init(&pn->lruvec);
5237 pn->memcg = memcg;
5238
5239 memcg->nodeinfo[node] = pn;
5240 return 0;
5241}
5242
5243static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5244{
5245 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5246
5247 if (!pn)
5248 return;
5249
5250 free_percpu(pn->lruvec_stats_percpu);
5251 kfree(pn);
5252}
5253
5254static void __mem_cgroup_free(struct mem_cgroup *memcg)
5255{
5256 int node;
5257
5258 for_each_node(node)
5259 free_mem_cgroup_per_node_info(memcg, node);
5260 kfree(memcg->vmstats);
5261 free_percpu(memcg->vmstats_percpu);
5262 kfree(memcg);
5263}
5264
5265static void mem_cgroup_free(struct mem_cgroup *memcg)
5266{
5267 lru_gen_exit_memcg(memcg);
5268 memcg_wb_domain_exit(memcg);
5269 __mem_cgroup_free(memcg);
5270}
5271
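/*
 * Note the two-step ID publication: idr_alloc() below only reserves the ID
 * (with a NULL pointer), and mem_cgroup_from_id() starts returning this
 * memcg only after idr_replace() at the end of mem_cgroup_css_online(),
 * so lookups never observe a half-initialized cgroup.
 */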
5272static struct mem_cgroup *mem_cgroup_alloc(void)
5273{
5274 struct mem_cgroup *memcg;
5275 int node;
5276 int __maybe_unused i;
5277 long error = -ENOMEM;
5278
5279 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
5280 if (!memcg)
5281 return ERR_PTR(error);
5282
5283 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5284 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
5285 if (memcg->id.id < 0) {
5286 error = memcg->id.id;
5287 goto fail;
5288 }
5289
5290 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
5291 if (!memcg->vmstats)
5292 goto fail;
5293
5294 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5295 GFP_KERNEL_ACCOUNT);
5296 if (!memcg->vmstats_percpu)
5297 goto fail;
5298
5299 for_each_node(node)
5300 if (alloc_mem_cgroup_per_node_info(memcg, node))
5301 goto fail;
5302
5303 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5304 goto fail;
5305
5306 INIT_WORK(&memcg->high_work, high_work_func);
5307 INIT_LIST_HEAD(&memcg->oom_notify);
5308 mutex_init(&memcg->thresholds_lock);
5309 spin_lock_init(&memcg->move_lock);
5310 vmpressure_init(&memcg->vmpressure);
5311 INIT_LIST_HEAD(&memcg->event_list);
5312 spin_lock_init(&memcg->event_list_lock);
5313 memcg->socket_pressure = jiffies;
5314#ifdef CONFIG_MEMCG_KMEM
5315 memcg->kmemcg_id = -1;
5316 INIT_LIST_HEAD(&memcg->objcg_list);
5317#endif
5318#ifdef CONFIG_CGROUP_WRITEBACK
5319 INIT_LIST_HEAD(&memcg->cgwb_list);
5320 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5321 memcg->cgwb_frn[i].done =
5322 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5323#endif
5324#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5325 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5326 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5327 memcg->deferred_split_queue.split_queue_len = 0;
5328#endif
5329 lru_gen_init_memcg(memcg);
5330 return memcg;
5331fail:
5332 mem_cgroup_id_remove(memcg);
5333 __mem_cgroup_free(memcg);
5334 return ERR_PTR(error);
5335}
5336
5337static struct cgroup_subsys_state * __ref
5338mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5339{
5340 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5341 struct mem_cgroup *memcg, *old_memcg;
5342
5343 old_memcg = set_active_memcg(parent);
5344 memcg = mem_cgroup_alloc();
5345 set_active_memcg(old_memcg);
5346 if (IS_ERR(memcg))
5347 return ERR_CAST(memcg);
5348
5349 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5350 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
5351#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
5352 memcg->zswap_max = PAGE_COUNTER_MAX;
5353#endif
5354 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5355 if (parent) {
5356 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
5357 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
5358
5359 page_counter_init(&memcg->memory, &parent->memory);
5360 page_counter_init(&memcg->swap, &parent->swap);
5361 page_counter_init(&memcg->kmem, &parent->kmem);
5362 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5363 } else {
5364 init_memcg_events();
5365 page_counter_init(&memcg->memory, NULL);
5366 page_counter_init(&memcg->swap, NULL);
5367 page_counter_init(&memcg->kmem, NULL);
5368 page_counter_init(&memcg->tcpmem, NULL);
5369
5370 root_mem_cgroup = memcg;
5371 return &memcg->css;
5372 }
5373
5374 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5375 static_branch_inc(&memcg_sockets_enabled_key);
5376
5377#if defined(CONFIG_MEMCG_KMEM)
5378 if (!cgroup_memory_nobpf)
5379 static_branch_inc(&memcg_bpf_enabled_key);
5380#endif
5381
5382 return &memcg->css;
5383}
5384
5385static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5386{
5387 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5388
5389 if (memcg_online_kmem(memcg))
5390 goto remove_id;
5391
5392 /*
5393 * A memcg must be visible for expand_shrinker_info()
5394 * by the time the maps are allocated. So, we allocate maps
5395 * here, when for_each_mem_cgroup() can't skip it.
5396 */
5397 if (alloc_shrinker_info(memcg))
5398 goto offline_kmem;
5399
5400 if (unlikely(mem_cgroup_is_root(memcg)))
5401 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5402 FLUSH_TIME);
5403 lru_gen_online_memcg(memcg);
5404
5405 /* Online state pins memcg ID, memcg ID pins CSS */
5406 refcount_set(&memcg->id.ref, 1);
5407 css_get(css);
5408
5409 /*
5410 * Ensure mem_cgroup_from_id() works once we're fully online.
5411 *
5412 * We could do this earlier and require callers to filter with
5413 * css_tryget_online(). But right now there are no users that
5414 * need earlier access, and the workingset code relies on the
5415 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
5416 * publish it here at the end of onlining. This matches the
5417 * regular ID destruction during offlining.
5418 */
5419 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5420
5421 return 0;
5422offline_kmem:
5423 memcg_offline_kmem(memcg);
5424remove_id:
5425 mem_cgroup_id_remove(memcg);
5426 return -ENOMEM;
5427}
5428
5429static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5430{
5431 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5432 struct mem_cgroup_event *event, *tmp;
5433
5434 /*
5435 * Unregister events and notify userspace.
5436 * Notify userspace about cgroup removing only after rmdir of cgroup
5437 * directory to avoid race between userspace and kernelspace.
5438 */
5439 spin_lock_irq(&memcg->event_list_lock);
5440 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5441 list_del_init(&event->list);
5442 schedule_work(&event->remove);
5443 }
5444 spin_unlock_irq(&memcg->event_list_lock);
5445
5446 page_counter_set_min(&memcg->memory, 0);
5447 page_counter_set_low(&memcg->memory, 0);
5448
5449 memcg_offline_kmem(memcg);
5450 reparent_shrinker_deferred(memcg);
5451 wb_memcg_offline(memcg);
5452 lru_gen_offline_memcg(memcg);
5453
5454 drain_all_stock(memcg);
5455
5456 mem_cgroup_id_put(memcg);
5457}
5458
5459static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5460{
5461 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5462
5463 invalidate_reclaim_iterators(memcg);
5464 lru_gen_release_memcg(memcg);
5465}
5466
5467static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5468{
5469 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5470 int __maybe_unused i;
5471
5472#ifdef CONFIG_CGROUP_WRITEBACK
5473 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5474 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5475#endif
5476 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5477 static_branch_dec(&memcg_sockets_enabled_key);
5478
5479 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5480 static_branch_dec(&memcg_sockets_enabled_key);
5481
5482#if defined(CONFIG_MEMCG_KMEM)
5483 if (!cgroup_memory_nobpf)
5484 static_branch_dec(&memcg_bpf_enabled_key);
5485#endif
5486
5487 vmpressure_cleanup(&memcg->vmpressure);
5488 cancel_work_sync(&memcg->high_work);
5489 mem_cgroup_remove_from_trees(memcg);
5490 free_shrinker_info(memcg);
5491 mem_cgroup_free(memcg);
5492}
5493
5494/**
5495 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5496 * @css: the target css
5497 *
5498 * Reset the states of the mem_cgroup associated with @css. This is
5499 * invoked when the userland requests disabling on the default hierarchy
5500 * but the memcg is pinned through dependency. The memcg should stop
5501 * applying policies and should revert to the vanilla state as it may be
5502 * made visible again.
5503 *
5504 * The current implementation only resets the essential configurations.
5505 * This needs to be expanded to cover all the visible parts.
5506 */
5507static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5508{
5509 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5510
5511 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5512 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5513 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5514 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5515 page_counter_set_min(&memcg->memory, 0);
5516 page_counter_set_low(&memcg->memory, 0);
5517 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5518 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
5519 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5520 memcg_wb_domain_size_changed(memcg);
5521}
5522
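/*
 * rstat flush callback. For every stat and event counter, fold together
 * (a) deltas propagated up from children that were flushed earlier and
 * (b) this CPU's change since the last flush, add the sum to the
 * hierarchical totals, and push it into the parent's pending array; the
 * per-CPU delta alone goes into the "local" (non-hierarchical) counters.
 * The same scheme repeats per node for the lruvec stats.
 */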
5523static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
5524{
5525 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5526 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5527 struct memcg_vmstats_percpu *statc;
5528 long delta, delta_cpu, v;
5529 int i, nid;
5530
5531 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
5532
5533 for (i = 0; i < MEMCG_NR_STAT; i++) {
5534 /*
5535 * Collect the aggregated propagation counts of groups
5536 * below us. We're in a per-cpu loop here and this is
5537 * a global counter, so the first cycle will get them.
5538 */
5539 delta = memcg->vmstats->state_pending[i];
5540 if (delta)
5541 memcg->vmstats->state_pending[i] = 0;
5542
5543 /* Add CPU changes on this level since the last flush */
5544 delta_cpu = 0;
5545 v = READ_ONCE(statc->state[i]);
5546 if (v != statc->state_prev[i]) {
5547 delta_cpu = v - statc->state_prev[i];
5548 delta += delta_cpu;
5549 statc->state_prev[i] = v;
5550 }
5551
5552 /* Aggregate counts on this level and propagate upwards */
5553 if (delta_cpu)
5554 memcg->vmstats->state_local[i] += delta_cpu;
5555
5556 if (delta) {
5557 memcg->vmstats->state[i] += delta;
5558 if (parent)
5559 parent->vmstats->state_pending[i] += delta;
5560 }
5561 }
5562
5563 for (i = 0; i < NR_MEMCG_EVENTS; i++) {
5564 delta = memcg->vmstats->events_pending[i];
5565 if (delta)
5566 memcg->vmstats->events_pending[i] = 0;
5567
5568 delta_cpu = 0;
5569 v = READ_ONCE(statc->events[i]);
5570 if (v != statc->events_prev[i]) {
5571 delta_cpu = v - statc->events_prev[i];
5572 delta += delta_cpu;
5573 statc->events_prev[i] = v;
5574 }
5575
5576 if (delta_cpu)
5577 memcg->vmstats->events_local[i] += delta_cpu;
5578
5579 if (delta) {
5580 memcg->vmstats->events[i] += delta;
5581 if (parent)
5582 parent->vmstats->events_pending[i] += delta;
5583 }
5584 }
5585
5586 for_each_node_state(nid, N_MEMORY) {
5587 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5588 struct mem_cgroup_per_node *ppn = NULL;
5589 struct lruvec_stats_percpu *lstatc;
5590
5591 if (parent)
5592 ppn = parent->nodeinfo[nid];
5593
5594 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5595
5596 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5597 delta = pn->lruvec_stats.state_pending[i];
5598 if (delta)
5599 pn->lruvec_stats.state_pending[i] = 0;
5600
5601 delta_cpu = 0;
5602 v = READ_ONCE(lstatc->state[i]);
5603 if (v != lstatc->state_prev[i]) {
5604 delta_cpu = v - lstatc->state_prev[i];
5605 delta += delta_cpu;
5606 lstatc->state_prev[i] = v;
5607 }
5608
5609 if (delta_cpu)
5610 pn->lruvec_stats.state_local[i] += delta_cpu;
5611
5612 if (delta) {
5613 pn->lruvec_stats.state[i] += delta;
5614 if (ppn)
5615 ppn->lruvec_stats.state_pending[i] += delta;
5616 }
5617 }
5618 }
5619}
5620
5621#ifdef CONFIG_MMU
5622/* Handlers for move charge at task migration. */
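/*
 * Before any pages are actually moved, the estimated total is precharged
 * to the destination memcg: first one bulk try_charge() without direct
 * reclaim (kswapd may be woken), then, if that fails, page-by-page
 * charges that may reclaim but never retry.
 */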
5623static int mem_cgroup_do_precharge(unsigned long count)
5624{
5625 int ret;
5626
5627 /* Try a single bulk charge without reclaim first, kswapd may wake */
5628 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5629 if (!ret) {
5630 mc.precharge += count;
5631 return ret;
5632 }
5633
5634 /* Try charges one by one with reclaim, but do not retry */
5635 while (count--) {
5636 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5637 if (ret)
5638 return ret;
5639 mc.precharge++;
5640 cond_resched();
5641 }
5642 return 0;
5643}
5644
5645union mc_target {
5646 struct page *page;
5647 swp_entry_t ent;
5648};
5649
5650enum mc_target_type {
5651 MC_TARGET_NONE = 0,
5652 MC_TARGET_PAGE,
5653 MC_TARGET_SWAP,
5654 MC_TARGET_DEVICE,
5655};
5656
5657static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5658 unsigned long addr, pte_t ptent)
5659{
5660 struct page *page = vm_normal_page(vma, addr, ptent);
5661
5662 if (!page)
5663 return NULL;
5664 if (PageAnon(page)) {
5665 if (!(mc.flags & MOVE_ANON))
5666 return NULL;
5667 } else {
5668 if (!(mc.flags & MOVE_FILE))
5669 return NULL;
5670 }
5671 get_page(page);
5672
5673 return page;
5674}
5675
5676#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5677static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5678 pte_t ptent, swp_entry_t *entry)
5679{
5680 struct page *page = NULL;
5681 swp_entry_t ent = pte_to_swp_entry(ptent);
5682
5683 if (!(mc.flags & MOVE_ANON))
5684 return NULL;
5685
5686 /*
5687 * Handle device private pages that are not accessible by the CPU, but
5688 * stored as special swap entries in the page table.
5689 */
5690 if (is_device_private_entry(ent)) {
5691 page = pfn_swap_entry_to_page(ent);
5692 if (!get_page_unless_zero(page))
5693 return NULL;
5694 return page;
5695 }
5696
5697 if (non_swap_entry(ent))
5698 return NULL;
5699
5700 /*
5701 * Because swap_cache_get_folio() updates some statistics counters,
5702 * we call find_get_page() with swapper_space directly.
5703 */
5704 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5705 entry->val = ent.val;
5706
5707 return page;
5708}
5709#else
5710static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5711 pte_t ptent, swp_entry_t *entry)
5712{
5713 return NULL;
5714}
5715#endif
5716
5717static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5718 unsigned long addr, pte_t ptent)
5719{
5720 unsigned long index;
5721 struct folio *folio;
5722
5723 if (!vma->vm_file) /* anonymous vma */
5724 return NULL;
5725 if (!(mc.flags & MOVE_FILE))
5726 return NULL;
5727
5728 /* folio is moved even if it's not RSS of this task (page-faulted). */
5729 /* shmem/tmpfs may report page out on swap: account for that too. */
5730 index = linear_page_index(vma, addr);
5731 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
5732 if (IS_ERR(folio))
5733 return NULL;
5734 return folio_file_page(folio, index);
5735}
5736
5737/**
5738 * mem_cgroup_move_account - move account of the page
5739 * @page: the page
5740 * @compound: charge the page as compound or small page
5741 * @from: mem_cgroup which the page is moved from.
5742 * @to: mem_cgroup which the page is moved to. @from != @to.
5743 *
5744 * The page must be locked and not on the LRU.
5745 *
5746 * This function doesn't do "charge" to the new cgroup and doesn't do
5747 * "uncharge" from the old cgroup.
5748 */
5749static int mem_cgroup_move_account(struct page *page,
5750 bool compound,
5751 struct mem_cgroup *from,
5752 struct mem_cgroup *to)
5753{
5754 struct folio *folio = page_folio(page);
5755 struct lruvec *from_vec, *to_vec;
5756 struct pglist_data *pgdat;
5757 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
5758 int nid, ret;
5759
5760 VM_BUG_ON(from == to);
5761 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5762 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
5763 VM_BUG_ON(compound && !folio_test_large(folio));
5764
5765 ret = -EINVAL;
5766 if (folio_memcg(folio) != from)
5767 goto out;
5768
5769 pgdat = folio_pgdat(folio);
5770 from_vec = mem_cgroup_lruvec(from, pgdat);
5771 to_vec = mem_cgroup_lruvec(to, pgdat);
5772
5773 folio_memcg_lock(folio);
5774
5775 if (folio_test_anon(folio)) {
5776 if (folio_mapped(folio)) {
5777 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5778 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5779 if (folio_test_pmd_mappable(folio)) {
5780 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5781 -nr_pages);
5782 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5783 nr_pages);
5784 }
5785 }
5786 } else {
5787 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5788 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5789
5790 if (folio_test_swapbacked(folio)) {
5791 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5792 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5793 }
5794
5795 if (folio_mapped(folio)) {
5796 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5797 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5798 }
5799
5800 if (folio_test_dirty(folio)) {
5801 struct address_space *mapping = folio_mapping(folio);
5802
5803 if (mapping_can_writeback(mapping)) {
5804 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5805 -nr_pages);
5806 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5807 nr_pages);
5808 }
5809 }
5810 }
5811
5812#ifdef CONFIG_SWAP
5813 if (folio_test_swapcache(folio)) {
5814 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
5815 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
5816 }
5817#endif
5818 if (folio_test_writeback(folio)) {
5819 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5820 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5821 }
5822
5823 /*
5824 * All state has been migrated, let's switch to the new memcg.
5825 *
5826 * It is safe to change page's memcg here because the page
5827 * is referenced, charged, isolated, and locked: we can't race
5828 * with (un)charging, migration, LRU putback, or anything else
5829 * that would rely on a stable page's memory cgroup.
5830 *
5831 * Note that folio_memcg_lock is a memcg lock, not a page lock,
5832 * to save space. As soon as we switch page's memory cgroup to a
5833 * new memcg that isn't locked, the above state can change
5834 * concurrently again. Make sure we're truly done with it.
5835 */
5836 smp_mb();
5837
5838 css_get(&to->css);
5839 css_put(&from->css);
5840
5841 folio->memcg_data = (unsigned long)to;
5842
5843 __folio_memcg_unlock(from);
5844
5845 ret = 0;
5846 nid = folio_nid(folio);
5847
5848 local_irq_disable();
5849 mem_cgroup_charge_statistics(to, nr_pages);
5850 memcg_check_events(to, nid);
5851 mem_cgroup_charge_statistics(from, -nr_pages);
5852 memcg_check_events(from, nid);
5853 local_irq_enable();
5854out:
5855 return ret;
5856}
5857
5858/**
5859 * get_mctgt_type - get target type of moving charge
5860 * @vma: the vma the pte to be checked belongs to
5861 * @addr: the address corresponding to the pte to be checked
5862 * @ptent: the pte to be checked
5863 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
5864 *
5865 * Context: Called with pte lock held.
5866 * Return:
5867 * * MC_TARGET_NONE - If the pte is not a target for move charge.
5868 * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
5869 * move charge. If @target is not NULL, the page is stored in target->page
5870 * with extra refcnt taken (Caller should release it).
5871 * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
5872 * target for charge migration. If @target is not NULL, the entry is
5873 * stored in target->ent.
5874 *   * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but the page is device memory and
5875 *   thus not on the lru. For now such a page is charged like a regular page
5876 *   would be, as it is just special memory taking the place of a regular page.
5877 *   See Documentation/mm/hmm.rst and include/linux/hmm.h
5878 */
5879static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5880 unsigned long addr, pte_t ptent, union mc_target *target)
5881{
5882 struct page *page = NULL;
5883 enum mc_target_type ret = MC_TARGET_NONE;
5884 swp_entry_t ent = { .val = 0 };
5885
5886 if (pte_present(ptent))
5887 page = mc_handle_present_pte(vma, addr, ptent);
5888 else if (pte_none_mostly(ptent))
5889 /*
5890 * PTE markers should be treated as a none pte here, separated
5891 * from other swap handling below.
5892 */
5893 page = mc_handle_file_pte(vma, addr, ptent);
5894 else if (is_swap_pte(ptent))
5895 page = mc_handle_swap_pte(vma, ptent, &ent);
5896
5897 if (target && page) {
5898 if (!trylock_page(page)) {
5899 put_page(page);
5900 return ret;
5901 }
5902 /*
5903 * page_mapped() must be stable during the move. This
5904 * pte is locked, so if it's present, the page cannot
5905 * become unmapped. If it isn't, we have only partial
5906 * control over the mapped state: the page lock will
5907 * prevent new faults against pagecache and swapcache,
5908 * so an unmapped page cannot become mapped. However,
5909 * if the page is already mapped elsewhere, it can
5910 * unmap, and there is nothing we can do about it.
5911 * Alas, skip moving the page in this case.
5912 */
5913 if (!pte_present(ptent) && page_mapped(page)) {
5914 unlock_page(page);
5915 put_page(page);
5916 return ret;
5917 }
5918 }
5919
5920 if (!page && !ent.val)
5921 return ret;
5922 if (page) {
5923 /*
5924 * Do only loose check w/o serialization.
5925 * mem_cgroup_move_account() checks the page is valid or
5926 * not under LRU exclusion.
5927 */
5928 if (page_memcg(page) == mc.from) {
5929 ret = MC_TARGET_PAGE;
5930 if (is_device_private_page(page) ||
5931 is_device_coherent_page(page))
5932 ret = MC_TARGET_DEVICE;
5933 if (target)
5934 target->page = page;
5935 }
5936 if (!ret || !target) {
5937 if (target)
5938 unlock_page(page);
5939 put_page(page);
5940 }
5941 }
5942 /*
5943 * There is a swap entry and a page doesn't exist or isn't charged.
5944 * But we cannot move a tail-page in a THP.
5945 */
5946 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5947 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5948 ret = MC_TARGET_SWAP;
5949 if (target)
5950 target->ent = ent;
5951 }
5952 return ret;
5953}
5954
5955#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5956/*
5957 * We don't consider PMD mapped swapping or file mapped pages because THP does
5958 * not support them for now.
5959 * Caller should make sure that pmd_trans_huge(pmd) is true.
5960 */
5961static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5962 unsigned long addr, pmd_t pmd, union mc_target *target)
5963{
5964 struct page *page = NULL;
5965 enum mc_target_type ret = MC_TARGET_NONE;
5966
5967 if (unlikely(is_swap_pmd(pmd))) {
5968 VM_BUG_ON(thp_migration_supported() &&
5969 !is_pmd_migration_entry(pmd));
5970 return ret;
5971 }
5972 page = pmd_page(pmd);
5973 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5974 if (!(mc.flags & MOVE_ANON))
5975 return ret;
5976 if (page_memcg(page) == mc.from) {
5977 ret = MC_TARGET_PAGE;
5978 if (target) {
5979 get_page(page);
5980 if (!trylock_page(page)) {
5981 put_page(page);
5982 return MC_TARGET_NONE;
5983 }
5984 target->page = page;
5985 }
5986 }
5987 return ret;
5988}
5989#else
5990static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5991 unsigned long addr, pmd_t pmd, union mc_target *target)
5992{
5993 return MC_TARGET_NONE;
5994}
5995#endif
5996
5997static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5998 unsigned long addr, unsigned long end,
5999 struct mm_walk *walk)
6000{
6001 struct vm_area_struct *vma = walk->vma;
6002 pte_t *pte;
6003 spinlock_t *ptl;
6004
6005 ptl = pmd_trans_huge_lock(pmd, vma);
6006 if (ptl) {
6007 /*
6008		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
6009		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE but
6010 * this might change.
6011 */
6012 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6013 mc.precharge += HPAGE_PMD_NR;
6014 spin_unlock(ptl);
6015 return 0;
6016 }
6017
6018 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6019 if (!pte)
6020 return 0;
6021 for (; addr != end; pte++, addr += PAGE_SIZE)
6022 if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
6023 mc.precharge++; /* increment precharge temporarily */
6024 pte_unmap_unlock(pte - 1, ptl);
6025 cond_resched();
6026
6027 return 0;
6028}
6029
6030static const struct mm_walk_ops precharge_walk_ops = {
6031 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6032 .walk_lock = PGWALK_RDLOCK,
6033};
6034
6035static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6036{
6037 unsigned long precharge;
6038
6039 mmap_read_lock(mm);
6040 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
6041 mmap_read_unlock(mm);
6042
6043 precharge = mc.precharge;
6044 mc.precharge = 0;
6045
6046 return precharge;
6047}
6048
6049static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6050{
6051 unsigned long precharge = mem_cgroup_count_precharge(mm);
6052
6053 VM_BUG_ON(mc.moving_task);
6054 mc.moving_task = current;
6055 return mem_cgroup_do_precharge(precharge);
6056}
6057
6058/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6059static void __mem_cgroup_clear_mc(void)
6060{
6061 struct mem_cgroup *from = mc.from;
6062 struct mem_cgroup *to = mc.to;
6063
6064 /* we must uncharge all the leftover precharges from mc.to */
6065 if (mc.precharge) {
6066 cancel_charge(mc.to, mc.precharge);
6067 mc.precharge = 0;
6068 }
6069 /*
6070 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6071 * we must uncharge here.
6072 */
6073 if (mc.moved_charge) {
6074 cancel_charge(mc.from, mc.moved_charge);
6075 mc.moved_charge = 0;
6076 }
6077 /* we must fixup refcnts and charges */
6078 if (mc.moved_swap) {
6079 /* uncharge swap account from the old cgroup */
6080 if (!mem_cgroup_is_root(mc.from))
6081 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
6082
6083 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
6084
6085 /*
6086 * we charged both to->memory and to->memsw, so we
6087 * should uncharge to->memory.
6088 */
6089 if (!mem_cgroup_is_root(mc.to))
6090 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
6091
6092 mc.moved_swap = 0;
6093 }
6094 memcg_oom_recover(from);
6095 memcg_oom_recover(to);
6096 wake_up_all(&mc.waitq);
6097}
6098
6099static void mem_cgroup_clear_mc(void)
6100{
6101 struct mm_struct *mm = mc.mm;
6102
6103 /*
6104 * we must clear moving_task before waking up waiters at the end of
6105 * task migration.
6106 */
6107 mc.moving_task = NULL;
6108 __mem_cgroup_clear_mc();
6109 spin_lock(&mc.lock);
6110 mc.from = NULL;
6111 mc.to = NULL;
6112 mc.mm = NULL;
6113 spin_unlock(&mc.lock);
6114
6115 mmput(mm);
6116}
6117
6118static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6119{
6120 struct cgroup_subsys_state *css;
6121 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
6122 struct mem_cgroup *from;
6123 struct task_struct *leader, *p;
6124 struct mm_struct *mm;
6125 unsigned long move_flags;
6126 int ret = 0;
6127
6128 /* charge immigration isn't supported on the default hierarchy */
6129 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6130 return 0;
6131
6132 /*
6133 * Multi-process migrations only happen on the default hierarchy
6134 * where charge immigration is not used. Perform charge
6135 * immigration if @tset contains a leader and whine if there are
6136 * multiple.
6137 */
6138 p = NULL;
6139 cgroup_taskset_for_each_leader(leader, css, tset) {
6140 WARN_ON_ONCE(p);
6141 p = leader;
6142 memcg = mem_cgroup_from_css(css);
6143 }
6144 if (!p)
6145 return 0;
6146
6147 /*
6148 * We are now committed to this value whatever it is. Changes in this
6149 * tunable will only affect upcoming migrations, not the current one.
6150 * So we need to save it, and keep it going.
6151 */
6152 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
6153 if (!move_flags)
6154 return 0;
6155
6156 from = mem_cgroup_from_task(p);
6157
6158 VM_BUG_ON(from == memcg);
6159
6160 mm = get_task_mm(p);
6161 if (!mm)
6162 return 0;
6163	/* We move charges only when we move an owner of the mm */
6164 if (mm->owner == p) {
6165 VM_BUG_ON(mc.from);
6166 VM_BUG_ON(mc.to);
6167 VM_BUG_ON(mc.precharge);
6168 VM_BUG_ON(mc.moved_charge);
6169 VM_BUG_ON(mc.moved_swap);
6170
6171 spin_lock(&mc.lock);
6172 mc.mm = mm;
6173 mc.from = from;
6174 mc.to = memcg;
6175 mc.flags = move_flags;
6176 spin_unlock(&mc.lock);
6177 /* We set mc.moving_task later */
6178
6179 ret = mem_cgroup_precharge_mc(mm);
6180 if (ret)
6181 mem_cgroup_clear_mc();
6182 } else {
6183 mmput(mm);
6184 }
6185 return ret;
6186}
6187
6188static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6189{
6190 if (mc.to)
6191 mem_cgroup_clear_mc();
6192}
6193
6194static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6195 unsigned long addr, unsigned long end,
6196 struct mm_walk *walk)
6197{
6198 int ret = 0;
6199 struct vm_area_struct *vma = walk->vma;
6200 pte_t *pte;
6201 spinlock_t *ptl;
6202 enum mc_target_type target_type;
6203 union mc_target target;
6204 struct page *page;
6205
6206 ptl = pmd_trans_huge_lock(pmd, vma);
6207 if (ptl) {
6208 if (mc.precharge < HPAGE_PMD_NR) {
6209 spin_unlock(ptl);
6210 return 0;
6211 }
6212 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6213 if (target_type == MC_TARGET_PAGE) {
6214 page = target.page;
6215 if (isolate_lru_page(page)) {
6216 if (!mem_cgroup_move_account(page, true,
6217 mc.from, mc.to)) {
6218 mc.precharge -= HPAGE_PMD_NR;
6219 mc.moved_charge += HPAGE_PMD_NR;
6220 }
6221 putback_lru_page(page);
6222 }
6223 unlock_page(page);
6224 put_page(page);
6225 } else if (target_type == MC_TARGET_DEVICE) {
6226 page = target.page;
6227 if (!mem_cgroup_move_account(page, true,
6228 mc.from, mc.to)) {
6229 mc.precharge -= HPAGE_PMD_NR;
6230 mc.moved_charge += HPAGE_PMD_NR;
6231 }
6232 unlock_page(page);
6233 put_page(page);
6234 }
6235 spin_unlock(ptl);
6236 return 0;
6237 }
6238
6239retry:
6240 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6241 if (!pte)
6242 return 0;
6243 for (; addr != end; addr += PAGE_SIZE) {
6244 pte_t ptent = ptep_get(pte++);
6245 bool device = false;
6246 swp_entry_t ent;
6247
6248 if (!mc.precharge)
6249 break;
6250
6251 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6252 case MC_TARGET_DEVICE:
6253 device = true;
6254 fallthrough;
6255 case MC_TARGET_PAGE:
6256 page = target.page;
6257 /*
6258 * We can have a part of the split pmd here. Moving it
6259			 * can be done, but it would be too convoluted, so simply
6260			 * ignore such a partial THP and keep it in the original
6261 * memcg. There should be somebody mapping the head.
6262 */
6263 if (PageTransCompound(page))
6264 goto put;
6265 if (!device && !isolate_lru_page(page))
6266 goto put;
6267 if (!mem_cgroup_move_account(page, false,
6268 mc.from, mc.to)) {
6269 mc.precharge--;
6270 /* we uncharge from mc.from later. */
6271 mc.moved_charge++;
6272 }
6273 if (!device)
6274 putback_lru_page(page);
6275put: /* get_mctgt_type() gets & locks the page */
6276 unlock_page(page);
6277 put_page(page);
6278 break;
6279 case MC_TARGET_SWAP:
6280 ent = target.ent;
6281 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6282 mc.precharge--;
6283 mem_cgroup_id_get_many(mc.to, 1);
6284 /* we fixup other refcnts and charges later. */
6285 mc.moved_swap++;
6286 }
6287 break;
6288 default:
6289 break;
6290 }
6291 }
6292 pte_unmap_unlock(pte - 1, ptl);
6293 cond_resched();
6294
6295 if (addr != end) {
6296 /*
6297 * We have consumed all precharges we got in can_attach().
6298		 * We try charging one by one, but don't do any additional
6299		 * charges to mc.to if we have already failed a charge once in
6300		 * the attach() phase.
6301 */
6302 ret = mem_cgroup_do_precharge(1);
6303 if (!ret)
6304 goto retry;
6305 }
6306
6307 return ret;
6308}
6309
6310static const struct mm_walk_ops charge_walk_ops = {
6311 .pmd_entry = mem_cgroup_move_charge_pte_range,
6312 .walk_lock = PGWALK_RDLOCK,
6313};
6314
6315static void mem_cgroup_move_charge(void)
6316{
6317 lru_add_drain_all();
6318 /*
6319 * Signal folio_memcg_lock() to take the memcg's move_lock
6320 * while we're moving its pages to another memcg. Then wait
6321 * for already started RCU-only updates to finish.
6322 */
6323 atomic_inc(&mc.from->moving_account);
6324 synchronize_rcu();
6325retry:
6326 if (unlikely(!mmap_read_trylock(mc.mm))) {
6327 /*
6328		 * Someone who is holding the mmap_lock might be waiting in
6329 * waitq. So we cancel all extra charges, wake up all waiters,
6330 * and retry. Because we cancel precharges, we might not be able
6331 * to move enough charges, but moving charge is a best-effort
6332 * feature anyway, so it wouldn't be a big problem.
6333 */
6334 __mem_cgroup_clear_mc();
6335 cond_resched();
6336 goto retry;
6337 }
6338 /*
6339 * When we have consumed all precharges and failed in doing
6340 * additional charge, the page walk just aborts.
6341 */
6342 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
6343 mmap_read_unlock(mc.mm);
6344 atomic_dec(&mc.from->moving_account);
6345}
6346
6347static void mem_cgroup_move_task(void)
6348{
6349 if (mc.to) {
6350 mem_cgroup_move_charge();
6351 mem_cgroup_clear_mc();
6352 }
6353}
6354#else /* !CONFIG_MMU */
6355static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6356{
6357 return 0;
6358}
6359static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6360{
6361}
6362static void mem_cgroup_move_task(void)
6363{
6364}
6365#endif
6366
6367#ifdef CONFIG_LRU_GEN
6368static void mem_cgroup_attach(struct cgroup_taskset *tset)
6369{
6370 struct task_struct *task;
6371 struct cgroup_subsys_state *css;
6372
6373 /* find the first leader if there is any */
6374 cgroup_taskset_for_each_leader(task, css, tset)
6375 break;
6376
6377 if (!task)
6378 return;
6379
6380 task_lock(task);
6381 if (task->mm && READ_ONCE(task->mm->owner) == task)
6382 lru_gen_migrate_mm(task->mm);
6383 task_unlock(task);
6384}
6385#else
6386static void mem_cgroup_attach(struct cgroup_taskset *tset)
6387{
6388}
6389#endif /* CONFIG_LRU_GEN */
6390
6391static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6392{
6393 if (value == PAGE_COUNTER_MAX)
6394 seq_puts(m, "max\n");
6395 else
6396 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6397
6398 return 0;
6399}
6400
6401static u64 memory_current_read(struct cgroup_subsys_state *css,
6402 struct cftype *cft)
6403{
6404 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6405
6406 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6407}
6408
6409static u64 memory_peak_read(struct cgroup_subsys_state *css,
6410 struct cftype *cft)
6411{
6412 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6413
6414 return (u64)memcg->memory.watermark * PAGE_SIZE;
6415}
6416
6417static int memory_min_show(struct seq_file *m, void *v)
6418{
6419 return seq_puts_memcg_tunable(m,
6420 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6421}
6422
6423static ssize_t memory_min_write(struct kernfs_open_file *of,
6424 char *buf, size_t nbytes, loff_t off)
6425{
6426 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6427 unsigned long min;
6428 int err;
6429
6430 buf = strstrip(buf);
6431 err = page_counter_memparse(buf, "max", &min);
6432 if (err)
6433 return err;
6434
6435 page_counter_set_min(&memcg->memory, min);
6436
6437 return nbytes;
6438}
6439
6440static int memory_low_show(struct seq_file *m, void *v)
6441{
6442 return seq_puts_memcg_tunable(m,
6443 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6444}
6445
6446static ssize_t memory_low_write(struct kernfs_open_file *of,
6447 char *buf, size_t nbytes, loff_t off)
6448{
6449 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6450 unsigned long low;
6451 int err;
6452
6453 buf = strstrip(buf);
6454 err = page_counter_memparse(buf, "max", &low);
6455 if (err)
6456 return err;
6457
6458 page_counter_set_low(&memcg->memory, low);
6459
6460 return nbytes;
6461}
6462
6463static int memory_high_show(struct seq_file *m, void *v)
6464{
6465 return seq_puts_memcg_tunable(m,
6466 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6467}
6468
6469static ssize_t memory_high_write(struct kernfs_open_file *of,
6470 char *buf, size_t nbytes, loff_t off)
6471{
6472 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6473 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6474 bool drained = false;
6475 unsigned long high;
6476 int err;
6477
6478 buf = strstrip(buf);
6479 err = page_counter_memparse(buf, "max", &high);
6480 if (err)
6481 return err;
6482
6483 page_counter_set_high(&memcg->memory, high);
6484
6485 for (;;) {
6486 unsigned long nr_pages = page_counter_read(&memcg->memory);
6487 unsigned long reclaimed;
6488
6489 if (nr_pages <= high)
6490 break;
6491
6492 if (signal_pending(current))
6493 break;
6494
6495 if (!drained) {
6496 drain_all_stock(memcg);
6497 drained = true;
6498 continue;
6499 }
6500
6501 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6502 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
6503
6504 if (!reclaimed && !nr_retries--)
6505 break;
6506 }
6507
6508 memcg_wb_domain_size_changed(memcg);
6509 return nbytes;
6510}
6511
6512static int memory_max_show(struct seq_file *m, void *v)
6513{
6514 return seq_puts_memcg_tunable(m,
6515 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6516}
6517
6518static ssize_t memory_max_write(struct kernfs_open_file *of,
6519 char *buf, size_t nbytes, loff_t off)
6520{
6521 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6522 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6523 bool drained = false;
6524 unsigned long max;
6525 int err;
6526
6527 buf = strstrip(buf);
6528 err = page_counter_memparse(buf, "max", &max);
6529 if (err)
6530 return err;
6531
6532 xchg(&memcg->memory.max, max);
6533
6534 for (;;) {
6535 unsigned long nr_pages = page_counter_read(&memcg->memory);
6536
6537 if (nr_pages <= max)
6538 break;
6539
6540 if (signal_pending(current))
6541 break;
6542
6543 if (!drained) {
6544 drain_all_stock(memcg);
6545 drained = true;
6546 continue;
6547 }
6548
6549 if (nr_reclaims) {
6550 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6551 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
6552 nr_reclaims--;
6553 continue;
6554 }
6555
6556 memcg_memory_event(memcg, MEMCG_OOM);
6557 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6558 break;
6559 }
6560
6561 memcg_wb_domain_size_changed(memcg);
6562 return nbytes;
6563}
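
/*
 * Illustrative userspace sketch (not built here): the two limit files above
 * are plain cgroupfs files, so configuring them is just writing a byte count
 * or the literal string "max". The cgroup path and the 512M value below are
 * invented for this example. Note the asymmetry implemented above: lowering
 * memory.max may end in an OOM kill of the group, while memory.high only
 * triggers reclaim here (over-high throttling is applied later, on the
 * charge path elsewhere in this file).
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_memcg_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Hypothetical cgroup; both files accept bytes or "max". */
	write_memcg_file("/sys/fs/cgroup/example/memory.high", "536870912");
	write_memcg_file("/sys/fs/cgroup/example/memory.max", "max");
	return 0;
}
#endif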
6564
6565static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6566{
6567 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6568 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6569 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6570 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6571 seq_printf(m, "oom_kill %lu\n",
6572 atomic_long_read(&events[MEMCG_OOM_KILL]));
6573 seq_printf(m, "oom_group_kill %lu\n",
6574 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
6575}
6576
6577static int memory_events_show(struct seq_file *m, void *v)
6578{
6579 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6580
6581 __memory_events_show(m, memcg->memory_events);
6582 return 0;
6583}
6584
6585static int memory_events_local_show(struct seq_file *m, void *v)
6586{
6587 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6588
6589 __memory_events_show(m, memcg->memory_events_local);
6590 return 0;
6591}
6592
6593static int memory_stat_show(struct seq_file *m, void *v)
6594{
6595 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6596 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
6597 struct seq_buf s;
6598
6599 if (!buf)
6600 return -ENOMEM;
6601 seq_buf_init(&s, buf, PAGE_SIZE);
6602 memory_stat_format(memcg, &s);
6603 seq_puts(m, buf);
6604 kfree(buf);
6605 return 0;
6606}
6607
6608#ifdef CONFIG_NUMA
6609static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
6610 int item)
6611{
6612 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
6613}
6614
6615static int memory_numa_stat_show(struct seq_file *m, void *v)
6616{
6617 int i;
6618 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6619
6620 mem_cgroup_flush_stats();
6621
6622 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6623 int nid;
6624
6625 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6626 continue;
6627
6628 seq_printf(m, "%s", memory_stats[i].name);
6629 for_each_node_state(nid, N_MEMORY) {
6630 u64 size;
6631 struct lruvec *lruvec;
6632
6633 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6634 size = lruvec_page_state_output(lruvec,
6635 memory_stats[i].idx);
6636 seq_printf(m, " N%d=%llu", nid, size);
6637 }
6638 seq_putc(m, '\n');
6639 }
6640
6641 return 0;
6642}
6643#endif
6644
6645static int memory_oom_group_show(struct seq_file *m, void *v)
6646{
6647 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6648
6649 seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
6650
6651 return 0;
6652}
6653
6654static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6655 char *buf, size_t nbytes, loff_t off)
6656{
6657 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6658 int ret, oom_group;
6659
6660 buf = strstrip(buf);
6661 if (!buf)
6662 return -EINVAL;
6663
6664 ret = kstrtoint(buf, 0, &oom_group);
6665 if (ret)
6666 return ret;
6667
6668 if (oom_group != 0 && oom_group != 1)
6669 return -EINVAL;
6670
6671 WRITE_ONCE(memcg->oom_group, oom_group);
6672
6673 return nbytes;
6674}
6675
6676static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
6677 size_t nbytes, loff_t off)
6678{
6679 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6680 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6681 unsigned long nr_to_reclaim, nr_reclaimed = 0;
6682 unsigned int reclaim_options;
6683 int err;
6684
6685 buf = strstrip(buf);
6686 err = page_counter_memparse(buf, "", &nr_to_reclaim);
6687 if (err)
6688 return err;
6689
6690 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
6691 while (nr_reclaimed < nr_to_reclaim) {
6692 unsigned long reclaimed;
6693
6694 if (signal_pending(current))
6695 return -EINTR;
6696
6697 /*
6698		 * This is the final attempt: drain percpu lru caches in the
6699 * hope of introducing more evictable pages for
6700 * try_to_free_mem_cgroup_pages().
6701 */
6702 if (!nr_retries)
6703 lru_add_drain_all();
6704
6705 reclaimed = try_to_free_mem_cgroup_pages(memcg,
6706 min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
6707 GFP_KERNEL, reclaim_options);
6708
6709 if (!reclaimed && !nr_retries--)
6710 return -EAGAIN;
6711
6712 nr_reclaimed += reclaimed;
6713 }
6714
6715 return nbytes;
6716}
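
/*
 * Illustrative userspace sketch (not built here) of driving proactive
 * reclaim through the file implemented above. A short write() of a byte
 * count asks the kernel to reclaim that much; if the retries above are
 * exhausted before the target is met, the write fails with EAGAIN. The
 * cgroup path and the 128M figure are invented for this example.
 */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "134217728";	/* 128 MiB, in bytes */
	int fd = open("/sys/fs/cgroup/example/memory.reclaim", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, req, strlen(req)) < 0 && errno == EAGAIN)
		fprintf(stderr, "could not reclaim the full amount\n");
	close(fd);
	return 0;
}
#endif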
6717
6718static struct cftype memory_files[] = {
6719 {
6720 .name = "current",
6721 .flags = CFTYPE_NOT_ON_ROOT,
6722 .read_u64 = memory_current_read,
6723 },
6724 {
6725 .name = "peak",
6726 .flags = CFTYPE_NOT_ON_ROOT,
6727 .read_u64 = memory_peak_read,
6728 },
6729 {
6730 .name = "min",
6731 .flags = CFTYPE_NOT_ON_ROOT,
6732 .seq_show = memory_min_show,
6733 .write = memory_min_write,
6734 },
6735 {
6736 .name = "low",
6737 .flags = CFTYPE_NOT_ON_ROOT,
6738 .seq_show = memory_low_show,
6739 .write = memory_low_write,
6740 },
6741 {
6742 .name = "high",
6743 .flags = CFTYPE_NOT_ON_ROOT,
6744 .seq_show = memory_high_show,
6745 .write = memory_high_write,
6746 },
6747 {
6748 .name = "max",
6749 .flags = CFTYPE_NOT_ON_ROOT,
6750 .seq_show = memory_max_show,
6751 .write = memory_max_write,
6752 },
6753 {
6754 .name = "events",
6755 .flags = CFTYPE_NOT_ON_ROOT,
6756 .file_offset = offsetof(struct mem_cgroup, events_file),
6757 .seq_show = memory_events_show,
6758 },
6759 {
6760 .name = "events.local",
6761 .flags = CFTYPE_NOT_ON_ROOT,
6762 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6763 .seq_show = memory_events_local_show,
6764 },
6765 {
6766 .name = "stat",
6767 .seq_show = memory_stat_show,
6768 },
6769#ifdef CONFIG_NUMA
6770 {
6771 .name = "numa_stat",
6772 .seq_show = memory_numa_stat_show,
6773 },
6774#endif
6775 {
6776 .name = "oom.group",
6777 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6778 .seq_show = memory_oom_group_show,
6779 .write = memory_oom_group_write,
6780 },
6781 {
6782 .name = "reclaim",
6783 .flags = CFTYPE_NS_DELEGATABLE,
6784 .write = memory_reclaim,
6785 },
6786 { } /* terminate */
6787};
6788
6789struct cgroup_subsys memory_cgrp_subsys = {
6790 .css_alloc = mem_cgroup_css_alloc,
6791 .css_online = mem_cgroup_css_online,
6792 .css_offline = mem_cgroup_css_offline,
6793 .css_released = mem_cgroup_css_released,
6794 .css_free = mem_cgroup_css_free,
6795 .css_reset = mem_cgroup_css_reset,
6796 .css_rstat_flush = mem_cgroup_css_rstat_flush,
6797 .can_attach = mem_cgroup_can_attach,
6798 .attach = mem_cgroup_attach,
6799 .cancel_attach = mem_cgroup_cancel_attach,
6800 .post_attach = mem_cgroup_move_task,
6801 .dfl_cftypes = memory_files,
6802 .legacy_cftypes = mem_cgroup_legacy_files,
6803 .early_init = 0,
6804};
6805
6806/*
6807 * This function calculates an individual cgroup's effective
6808 * protection which is derived from its own memory.min/low, its
6809 * parent's and siblings' settings, as well as the actual memory
6810 * distribution in the tree.
6811 *
6812 * The following rules apply to the effective protection values:
6813 *
6814 * 1. At the first level of reclaim, effective protection is equal to
6815 * the declared protection in memory.min and memory.low.
6816 *
6817 * 2. To enable safe delegation of the protection configuration, at
6818 * subsequent levels the effective protection is capped to the
6819 * parent's effective protection.
6820 *
6821 * 3. To make complex and dynamic subtrees easier to configure, the
6822 * user is allowed to overcommit the declared protection at a given
6823 * level. If that is the case, the parent's effective protection is
6824 * distributed to the children in proportion to how much protection
6825 * they have declared and how much of it they are utilizing.
6826 *
6827 * This makes distribution proportional, but also work-conserving:
6828 * if one cgroup claims much more protection than it uses memory,
6829 * the unused remainder is available to its siblings.
6830 *
6831 * 4. Conversely, when the declared protection is undercommitted at a
6832 * given level, the distribution of the larger parental protection
6833 * budget is NOT proportional. A cgroup's protection from a sibling
6834 * is capped to its own memory.min/low setting.
6835 *
6836 * 5. However, to allow protecting recursive subtrees from each other
6837 * without having to declare each individual cgroup's fixed share
6838 * of the ancestor's claim to protection, any unutilized -
6839 * "floating" - protection from up the tree is distributed in
6840 * proportion to each cgroup's *usage*. This makes the protection
6841 * neutral wrt sibling cgroups and lets them compete freely over
6842 * the shared parental protection budget, but it protects the
6843 * subtree as a whole from neighboring subtrees.
6844 *
6845 * Note that 4. and 5. are not in conflict: 4. is about protecting
6846 * against immediate siblings whereas 5. is about protecting against
6847 * neighboring subtrees.
6848 */
6849static unsigned long effective_protection(unsigned long usage,
6850 unsigned long parent_usage,
6851 unsigned long setting,
6852 unsigned long parent_effective,
6853 unsigned long siblings_protected)
6854{
6855 unsigned long protected;
6856 unsigned long ep;
6857
6858 protected = min(usage, setting);
6859 /*
6860 * If all cgroups at this level combined claim and use more
6861 * protection than what the parent affords them, distribute
6862 * shares in proportion to utilization.
6863 *
6864 * We are using actual utilization rather than the statically
6865 * claimed protection in order to be work-conserving: claimed
6866 * but unused protection is available to siblings that would
6867 * otherwise get a smaller chunk than what they claimed.
6868 */
6869 if (siblings_protected > parent_effective)
6870 return protected * parent_effective / siblings_protected;
6871
6872 /*
6873 * Ok, utilized protection of all children is within what the
6874 * parent affords them, so we know whatever this child claims
6875 * and utilizes is effectively protected.
6876 *
6877 * If there is unprotected usage beyond this value, reclaim
6878 * will apply pressure in proportion to that amount.
6879 *
6880 * If there is unutilized protection, the cgroup will be fully
6881 * shielded from reclaim, but we do return a smaller value for
6882 * protection than what the group could enjoy in theory. This
6883 * is okay. With the overcommit distribution above, effective
6884 * protection is always dependent on how memory is actually
6885 * consumed among the siblings anyway.
6886 */
6887 ep = protected;
6888
6889 /*
6890 * If the children aren't claiming (all of) the protection
6891 * afforded to them by the parent, distribute the remainder in
6892 * proportion to the (unprotected) memory of each cgroup. That
6893 * way, cgroups that aren't explicitly prioritized wrt each
6894 * other compete freely over the allowance, but they are
6895 * collectively protected from neighboring trees.
6896 *
6897 * We're using unprotected memory for the weight so that if
6898 * some cgroups DO claim explicit protection, we don't protect
6899 * the same bytes twice.
6900 *
6901 * Check both usage and parent_usage against the respective
6902 * protected values. One should imply the other, but they
6903 * aren't read atomically - make sure the division is sane.
6904 */
6905 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6906 return ep;
6907 if (parent_effective > siblings_protected &&
6908 parent_usage > siblings_protected &&
6909 usage > protected) {
6910 unsigned long unclaimed;
6911
6912 unclaimed = parent_effective - siblings_protected;
6913 unclaimed *= usage - protected;
6914 unclaimed /= parent_usage - siblings_protected;
6915
6916 ep += unclaimed;
6917 }
6918
6919 return ep;
6920}
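
/*
 * A worked example of the helper above, with invented numbers. This is a
 * standalone copy of the same arithmetic, kept out of the build and meant
 * purely as illustration; it skips the CGRP_ROOT_MEMORY_RECURSIVE_PROT
 * gating and simply applies rule 3 (proportional scaling on overcommit)
 * and rule 5 (distributing unclaimed parental protection by usage).
 */
#if 0
#include <stdio.h>

static unsigned long ep_example(unsigned long usage, unsigned long parent_usage,
				unsigned long setting, unsigned long parent_effective,
				unsigned long siblings_protected)
{
	unsigned long protected = usage < setting ? usage : setting;
	unsigned long ep = protected;

	/* Rule 3: siblings claim and use more than the parent affords. */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/* Rule 5: hand out the parent's unclaimed budget in proportion to usage. */
	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected && usage > protected)
		ep += (parent_effective - siblings_protected) *
		      (usage - protected) / (parent_usage - siblings_protected);

	return ep;
}

int main(void)
{
	/* Overcommit: protected=600, scaled to 600*1000/2000 = 300 pages. */
	printf("%lu\n", ep_example(600, 1800, 800, 1000, 2000));
	/* Undercommit: protected=200, plus 600*300/400 floating = 650 pages. */
	printf("%lu\n", ep_example(500, 800, 200, 1000, 400));
	return 0;
}
#endif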
6921
6922/**
6923 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
6924 * @root: the top ancestor of the sub-tree being checked
6925 * @memcg: the memory cgroup to check
6926 *
6927 * WARNING: This function is not stateless! It can only be used as part
6928 * of a top-down tree iteration, not for isolated queries.
6929 */
6930void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6931 struct mem_cgroup *memcg)
6932{
6933 unsigned long usage, parent_usage;
6934 struct mem_cgroup *parent;
6935
6936 if (mem_cgroup_disabled())
6937 return;
6938
6939 if (!root)
6940 root = root_mem_cgroup;
6941
6942 /*
6943 * Effective values of the reclaim targets are ignored so they
6944 * can be stale. Have a look at mem_cgroup_protection for more
6945 * details.
6946 * TODO: calculation should be more robust so that we do not need
6947 * that special casing.
6948 */
6949 if (memcg == root)
6950 return;
6951
6952 usage = page_counter_read(&memcg->memory);
6953 if (!usage)
6954 return;
6955
6956 parent = parent_mem_cgroup(memcg);
6957
6958 if (parent == root) {
6959 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6960 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6961 return;
6962 }
6963
6964 parent_usage = page_counter_read(&parent->memory);
6965
6966 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6967 READ_ONCE(memcg->memory.min),
6968 READ_ONCE(parent->memory.emin),
6969 atomic_long_read(&parent->memory.children_min_usage)));
6970
6971 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6972 READ_ONCE(memcg->memory.low),
6973 READ_ONCE(parent->memory.elow),
6974 atomic_long_read(&parent->memory.children_low_usage)));
6975}
6976
6977static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
6978 gfp_t gfp)
6979{
6980 long nr_pages = folio_nr_pages(folio);
6981 int ret;
6982
6983 ret = try_charge(memcg, gfp, nr_pages);
6984 if (ret)
6985 goto out;
6986
6987 css_get(&memcg->css);
6988 commit_charge(folio, memcg);
6989
6990 local_irq_disable();
6991 mem_cgroup_charge_statistics(memcg, nr_pages);
6992 memcg_check_events(memcg, folio_nid(folio));
6993 local_irq_enable();
6994out:
6995 return ret;
6996}
6997
6998int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
6999{
7000 struct mem_cgroup *memcg;
7001 int ret;
7002
7003 memcg = get_mem_cgroup_from_mm(mm);
7004 ret = charge_memcg(folio, memcg, gfp);
7005 css_put(&memcg->css);
7006
7007 return ret;
7008}
7009
7010/**
7011 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
7012 * @folio: folio to charge.
7013 * @mm: mm context of the victim
7014 * @gfp: reclaim mode
7015 * @entry: swap entry for which the folio is allocated
7016 *
7017 * This function charges a folio allocated for swapin. Please call this before
7018 * adding the folio to the swapcache.
7019 *
7020 * Returns 0 on success. Otherwise, an error code is returned.
7021 */
7022int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
7023 gfp_t gfp, swp_entry_t entry)
7024{
7025 struct mem_cgroup *memcg;
7026 unsigned short id;
7027 int ret;
7028
7029 if (mem_cgroup_disabled())
7030 return 0;
7031
7032 id = lookup_swap_cgroup_id(entry);
7033 rcu_read_lock();
7034 memcg = mem_cgroup_from_id(id);
7035 if (!memcg || !css_tryget_online(&memcg->css))
7036 memcg = get_mem_cgroup_from_mm(mm);
7037 rcu_read_unlock();
7038
7039 ret = charge_memcg(folio, memcg, gfp);
7040
7041 css_put(&memcg->css);
7042 return ret;
7043}
7044
7045/*
7046 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
7047 * @entry: swap entry for which the page is charged
7048 *
7049 * Call this function after successfully adding the charged page to swapcache.
7050 *
7051 * Note: This function assumes the page for which the swap slot is being
7052 * uncharged is an order-0 page.
7053 */
7054void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
7055{
7056 /*
7057 * Cgroup1's unified memory+swap counter has been charged with the
7058 * new swapcache page, finish the transfer by uncharging the swap
7059 * slot. The swap slot would also get uncharged when it dies, but
7060 * it can stick around indefinitely and we'd count the page twice
7061 * the entire time.
7062 *
7063 * Cgroup2 has separate resource counters for memory and swap,
7064 * so this is a non-issue here. Memory and swap charge lifetimes
7065 * correspond 1:1 to page and swap slot lifetimes: we charge the
7066 * page to memory here, and uncharge swap when the slot is freed.
7067 */
7068 if (!mem_cgroup_disabled() && do_memsw_account()) {
7069 /*
7070 * The swap entry might not get freed for a long time,
7071 * let's not wait for it. The page already received a
7072 * memory+swap charge, drop the swap entry duplicate.
7073 */
7074 mem_cgroup_uncharge_swap(entry, 1);
7075 }
7076}
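
/*
 * Ordering sketch (illustrative only, not built): the two helpers above are
 * meant to bracket the swap cache insertion during swapin. The folio is
 * charged before it becomes visible in the swap cache, and the duplicate
 * swap charge is dropped only once the insertion has succeeded.
 * insert_into_swap_cache() below is a hypothetical stand-in for the real
 * swap cache insertion; error handling and locking are omitted.
 */
#if 0
static struct folio *swapin_alloc_example(struct mm_struct *mm, gfp_t gfp,
					  swp_entry_t entry)
{
	struct folio *folio = folio_alloc(gfp, 0);

	if (!folio)
		return NULL;

	/* Charge first, while the folio is still private to us. */
	if (mem_cgroup_swapin_charge_folio(folio, mm, gfp, entry))
		goto out_put;

	if (insert_into_swap_cache(folio, entry, gfp))
		goto out_put;

	/* Insertion succeeded: drop the memsw duplicate for cgroup1. */
	mem_cgroup_swapin_uncharge_swap(entry);
	return folio;

out_put:
	folio_put(folio);	/* any charge is undone when the folio is freed */
	return NULL;
}
#endif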
7077
7078struct uncharge_gather {
7079 struct mem_cgroup *memcg;
7080 unsigned long nr_memory;
7081 unsigned long pgpgout;
7082 unsigned long nr_kmem;
7083 int nid;
7084};
7085
7086static inline void uncharge_gather_clear(struct uncharge_gather *ug)
7087{
7088 memset(ug, 0, sizeof(*ug));
7089}
7090
7091static void uncharge_batch(const struct uncharge_gather *ug)
7092{
7093 unsigned long flags;
7094
7095 if (ug->nr_memory) {
7096 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
7097 if (do_memsw_account())
7098 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
7099 if (ug->nr_kmem)
7100 memcg_account_kmem(ug->memcg, -ug->nr_kmem);
7101 memcg_oom_recover(ug->memcg);
7102 }
7103
7104 local_irq_save(flags);
7105 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
7106 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
7107 memcg_check_events(ug->memcg, ug->nid);
7108 local_irq_restore(flags);
7109
7110 /* drop reference from uncharge_folio */
7111 css_put(&ug->memcg->css);
7112}
7113
7114static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
7115{
7116 long nr_pages;
7117 struct mem_cgroup *memcg;
7118 struct obj_cgroup *objcg;
7119
7120 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7121
7122 /*
7123 * Nobody should be changing or seriously looking at
7124 * folio memcg or objcg at this point, we have fully
7125 * exclusive access to the folio.
7126 */
7127 if (folio_memcg_kmem(folio)) {
7128 objcg = __folio_objcg(folio);
7129 /*
7130 * This get matches the put at the end of the function and
7131 * kmem pages do not hold memcg references anymore.
7132 */
7133 memcg = get_mem_cgroup_from_objcg(objcg);
7134 } else {
7135 memcg = __folio_memcg(folio);
7136 }
7137
7138 if (!memcg)
7139 return;
7140
7141 if (ug->memcg != memcg) {
7142 if (ug->memcg) {
7143 uncharge_batch(ug);
7144 uncharge_gather_clear(ug);
7145 }
7146 ug->memcg = memcg;
7147 ug->nid = folio_nid(folio);
7148
7149 /* pairs with css_put in uncharge_batch */
7150 css_get(&memcg->css);
7151 }
7152
7153 nr_pages = folio_nr_pages(folio);
7154
7155 if (folio_memcg_kmem(folio)) {
7156 ug->nr_memory += nr_pages;
7157 ug->nr_kmem += nr_pages;
7158
7159 folio->memcg_data = 0;
7160 obj_cgroup_put(objcg);
7161 } else {
7162 /* LRU pages aren't accounted at the root level */
7163 if (!mem_cgroup_is_root(memcg))
7164 ug->nr_memory += nr_pages;
7165 ug->pgpgout++;
7166
7167 folio->memcg_data = 0;
7168 }
7169
7170 css_put(&memcg->css);
7171}
7172
7173void __mem_cgroup_uncharge(struct folio *folio)
7174{
7175 struct uncharge_gather ug;
7176
7177 /* Don't touch folio->lru of any random page, pre-check: */
7178 if (!folio_memcg(folio))
7179 return;
7180
7181 uncharge_gather_clear(&ug);
7182 uncharge_folio(folio, &ug);
7183 uncharge_batch(&ug);
7184}
7185
7186/**
7187 * __mem_cgroup_uncharge_list - uncharge a list of pages
7188 * @page_list: list of pages to uncharge
7189 *
7190 * Uncharge a list of pages previously charged with
7191 * __mem_cgroup_charge().
7192 */
7193void __mem_cgroup_uncharge_list(struct list_head *page_list)
7194{
7195 struct uncharge_gather ug;
7196 struct folio *folio;
7197
7198 uncharge_gather_clear(&ug);
7199 list_for_each_entry(folio, page_list, lru)
7200 uncharge_folio(folio, &ug);
7201 if (ug.memcg)
7202 uncharge_batch(&ug);
7203}
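
/*
 * The gather pattern used by uncharge_folio()/uncharge_batch() above,
 * reduced to its shape (illustrative only, not built): walk a list,
 * accumulate totals while consecutive items share an owner, and flush one
 * batched update whenever the owner changes or the walk ends. All names in
 * this sketch (struct item, flush_owner, ...) are invented for the example.
 */
#if 0
#include <stdio.h>

struct item {
	struct item *next;
	int owner;
	unsigned long size;
};

/* Apply one batched update for @owner; stands in for uncharge_batch(). */
static void flush_owner(int owner, unsigned long total)
{
	printf("owner %d: uncharge %lu\n", owner, total);
}

static void uncharge_list_example(struct item *head)
{
	int cur_owner = -1;
	unsigned long total = 0;
	struct item *it;

	for (it = head; it; it = it->next) {
		if (it->owner != cur_owner) {
			if (cur_owner != -1)
				flush_owner(cur_owner, total);
			cur_owner = it->owner;
			total = 0;
		}
		total += it->size;
	}
	if (cur_owner != -1)
		flush_owner(cur_owner, total);
}
#endif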
7204
7205/**
7206 * mem_cgroup_migrate - Charge a folio's replacement.
7207 * @old: Currently circulating folio.
7208 * @new: Replacement folio.
7209 *
7210 * Charge @new as a replacement folio for @old. @old will
7211 * be uncharged upon free.
7212 *
7213 * Both folios must be locked, @new->mapping must be set up.
7214 */
7215void mem_cgroup_migrate(struct folio *old, struct folio *new)
7216{
7217 struct mem_cgroup *memcg;
7218 long nr_pages = folio_nr_pages(new);
7219 unsigned long flags;
7220
7221 VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
7222 VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
7223 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
7224 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
7225
7226 if (mem_cgroup_disabled())
7227 return;
7228
7229 /* Page cache replacement: new folio already charged? */
7230 if (folio_memcg(new))
7231 return;
7232
7233 memcg = folio_memcg(old);
7234 VM_WARN_ON_ONCE_FOLIO(!memcg, old);
7235 if (!memcg)
7236 return;
7237
7238 /* Force-charge the new page. The old one will be freed soon */
7239 if (!mem_cgroup_is_root(memcg)) {
7240 page_counter_charge(&memcg->memory, nr_pages);
7241 if (do_memsw_account())
7242 page_counter_charge(&memcg->memsw, nr_pages);
7243 }
7244
7245 css_get(&memcg->css);
7246 commit_charge(new, memcg);
7247
7248 local_irq_save(flags);
7249 mem_cgroup_charge_statistics(memcg, nr_pages);
7250 memcg_check_events(memcg, folio_nid(new));
7251 local_irq_restore(flags);
7252}
7253
7254DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
7255EXPORT_SYMBOL(memcg_sockets_enabled_key);
7256
7257void mem_cgroup_sk_alloc(struct sock *sk)
7258{
7259 struct mem_cgroup *memcg;
7260
7261 if (!mem_cgroup_sockets_enabled)
7262 return;
7263
7264	/* Do not associate the sock with an unrelated interrupted task's memcg. */
7265 if (!in_task())
7266 return;
7267
7268 rcu_read_lock();
7269 memcg = mem_cgroup_from_task(current);
7270 if (mem_cgroup_is_root(memcg))
7271 goto out;
7272 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7273 goto out;
7274 if (css_tryget(&memcg->css))
7275 sk->sk_memcg = memcg;
7276out:
7277 rcu_read_unlock();
7278}
7279
7280void mem_cgroup_sk_free(struct sock *sk)
7281{
7282 if (sk->sk_memcg)
7283 css_put(&sk->sk_memcg->css);
7284}
7285
7286/**
7287 * mem_cgroup_charge_skmem - charge socket memory
7288 * @memcg: memcg to charge
7289 * @nr_pages: number of pages to charge
7290 * @gfp_mask: reclaim mode
7291 *
7292 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
7293 * @memcg's configured limit, %false if it doesn't.
7294 */
7295bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7296 gfp_t gfp_mask)
7297{
7298 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7299 struct page_counter *fail;
7300
7301 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7302 memcg->tcpmem_pressure = 0;
7303 return true;
7304 }
7305 memcg->tcpmem_pressure = 1;
7306 if (gfp_mask & __GFP_NOFAIL) {
7307 page_counter_charge(&memcg->tcpmem, nr_pages);
7308 return true;
7309 }
7310 return false;
7311 }
7312
7313 if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7314 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7315 return true;
7316 }
7317
7318 return false;
7319}
7320
7321/**
7322 * mem_cgroup_uncharge_skmem - uncharge socket memory
7323 * @memcg: memcg to uncharge
7324 * @nr_pages: number of pages to uncharge
7325 */
7326void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7327{
7328 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7329 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7330 return;
7331 }
7332
7333 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7334
7335 refill_stock(memcg, nr_pages);
7336}
7337
7338static int __init cgroup_memory(char *s)
7339{
7340 char *token;
7341
7342 while ((token = strsep(&s, ",")) != NULL) {
7343 if (!*token)
7344 continue;
7345 if (!strcmp(token, "nosocket"))
7346 cgroup_memory_nosocket = true;
7347 if (!strcmp(token, "nokmem"))
7348 cgroup_memory_nokmem = true;
7349 if (!strcmp(token, "nobpf"))
7350 cgroup_memory_nobpf = true;
7351 }
7352 return 1;
7353}
7354__setup("cgroup.memory=", cgroup_memory);
7355
7356/*
7357 * subsys_initcall() for memory controller.
7358 *
7359 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
7360 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
7361 * basically everything that doesn't depend on a specific mem_cgroup structure
7362 * should be initialized from here.
7363 */
7364static int __init mem_cgroup_init(void)
7365{
7366 int cpu, node;
7367
7368 /*
7369	 * Currently an s32 type (see struct batched_lruvec_stat) is used for
7370	 * per-memcg-per-cpu caching of per-node statistics. For this to work
7371	 * correctly, we must make sure that the overfill threshold can't
7372	 * exceed S32_MAX / PAGE_SIZE.
7373 */
7374 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7375
7376 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7377 memcg_hotplug_cpu_dead);
7378
7379 for_each_possible_cpu(cpu)
7380 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7381 drain_local_stock);
7382
7383 for_each_node(node) {
7384 struct mem_cgroup_tree_per_node *rtpn;
7385
7386 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
7387
7388 rtpn->rb_root = RB_ROOT;
7389 rtpn->rb_rightmost = NULL;
7390 spin_lock_init(&rtpn->lock);
7391 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7392 }
7393
7394 return 0;
7395}
7396subsys_initcall(mem_cgroup_init);
7397
7398#ifdef CONFIG_SWAP
7399static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7400{
7401 while (!refcount_inc_not_zero(&memcg->id.ref)) {
7402 /*
7403		 * The root cgroup cannot be destroyed, so its refcount must
7404 * always be >= 1.
7405 */
7406 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
7407 VM_BUG_ON(1);
7408 break;
7409 }
7410 memcg = parent_mem_cgroup(memcg);
7411 if (!memcg)
7412 memcg = root_mem_cgroup;
7413 }
7414 return memcg;
7415}
7416
7417/**
7418 * mem_cgroup_swapout - transfer a memsw charge to swap
7419 * @folio: folio whose memsw charge to transfer
7420 * @entry: swap entry to move the charge to
7421 *
7422 * Transfer the memsw charge of @folio to @entry.
7423 */
7424void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
7425{
7426 struct mem_cgroup *memcg, *swap_memcg;
7427 unsigned int nr_entries;
7428 unsigned short oldid;
7429
7430 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7431 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
7432
7433 if (mem_cgroup_disabled())
7434 return;
7435
7436 if (!do_memsw_account())
7437 return;
7438
7439 memcg = folio_memcg(folio);
7440
7441 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7442 if (!memcg)
7443 return;
7444
7445 /*
7446 * In case the memcg owning these pages has been offlined and doesn't
7447 * have an ID allocated to it anymore, charge the closest online
7448 * ancestor for the swap instead and transfer the memory+swap charge.
7449 */
7450 swap_memcg = mem_cgroup_id_get_online(memcg);
7451 nr_entries = folio_nr_pages(folio);
7452 /* Get references for the tail pages, too */
7453 if (nr_entries > 1)
7454 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7455 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7456 nr_entries);
7457 VM_BUG_ON_FOLIO(oldid, folio);
7458 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7459
7460 folio->memcg_data = 0;
7461
7462 if (!mem_cgroup_is_root(memcg))
7463 page_counter_uncharge(&memcg->memory, nr_entries);
7464
7465 if (memcg != swap_memcg) {
7466 if (!mem_cgroup_is_root(swap_memcg))
7467 page_counter_charge(&swap_memcg->memsw, nr_entries);
7468 page_counter_uncharge(&memcg->memsw, nr_entries);
7469 }
7470
7471 /*
7472 * Interrupts should be disabled here because the caller holds the
7473 * i_pages lock which is taken with interrupts-off. It is
7474 * important here to have the interrupts disabled because it is the
7475 * only synchronisation we have for updating the per-CPU variables.
7476 */
7477 memcg_stats_lock();
7478 mem_cgroup_charge_statistics(memcg, -nr_entries);
7479 memcg_stats_unlock();
7480 memcg_check_events(memcg, folio_nid(folio));
7481
7482 css_put(&memcg->css);
7483}
7484
7485/**
7486 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
7487 * @folio: folio being added to swap
7488 * @entry: swap entry to charge
7489 *
7490 * Try to charge @folio's memcg for the swap space at @entry.
7491 *
7492 * Returns 0 on success, -ENOMEM on failure.
7493 */
7494int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
7495{
7496 unsigned int nr_pages = folio_nr_pages(folio);
7497 struct page_counter *counter;
7498 struct mem_cgroup *memcg;
7499 unsigned short oldid;
7500
7501 if (do_memsw_account())
7502 return 0;
7503
7504 memcg = folio_memcg(folio);
7505
7506 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7507 if (!memcg)
7508 return 0;
7509
7510 if (!entry.val) {
7511 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7512 return 0;
7513 }
7514
7515 memcg = mem_cgroup_id_get_online(memcg);
7516
7517 if (!mem_cgroup_is_root(memcg) &&
7518 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7519 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7520 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7521 mem_cgroup_id_put(memcg);
7522 return -ENOMEM;
7523 }
7524
7525 /* Get references for the tail pages, too */
7526 if (nr_pages > 1)
7527 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7528 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7529 VM_BUG_ON_FOLIO(oldid, folio);
7530 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7531
7532 return 0;
7533}
7534
7535/**
7536 * __mem_cgroup_uncharge_swap - uncharge swap space
7537 * @entry: swap entry to uncharge
7538 * @nr_pages: the amount of swap space to uncharge
7539 */
7540void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7541{
7542 struct mem_cgroup *memcg;
7543 unsigned short id;
7544
7545 id = swap_cgroup_record(entry, 0, nr_pages);
7546 rcu_read_lock();
7547 memcg = mem_cgroup_from_id(id);
7548 if (memcg) {
7549 if (!mem_cgroup_is_root(memcg)) {
7550 if (do_memsw_account())
7551 page_counter_uncharge(&memcg->memsw, nr_pages);
7552 else
7553 page_counter_uncharge(&memcg->swap, nr_pages);
7554 }
7555 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7556 mem_cgroup_id_put_many(memcg, nr_pages);
7557 }
7558 rcu_read_unlock();
7559}
7560
7561long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7562{
7563 long nr_swap_pages = get_nr_swap_pages();
7564
7565 if (mem_cgroup_disabled() || do_memsw_account())
7566 return nr_swap_pages;
7567 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
7568 nr_swap_pages = min_t(long, nr_swap_pages,
7569 READ_ONCE(memcg->swap.max) -
7570 page_counter_read(&memcg->swap));
7571 return nr_swap_pages;
7572}
7573
7574bool mem_cgroup_swap_full(struct folio *folio)
7575{
7576 struct mem_cgroup *memcg;
7577
7578 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
7579
7580 if (vm_swap_full())
7581 return true;
7582 if (do_memsw_account())
7583 return false;
7584
7585 memcg = folio_memcg(folio);
7586 if (!memcg)
7587 return false;
7588
7589 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
7590 unsigned long usage = page_counter_read(&memcg->swap);
7591
7592 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7593 usage * 2 >= READ_ONCE(memcg->swap.max))
7594 return true;
7595 }
7596
7597 return false;
7598}
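
/*
 * Numeric illustration (not built; numbers invented) of the check above:
 * with memory.swap.max set to 1 GiB (262144 pages of 4 KiB), swap is
 * reported as "full" once usage reaches 512 MiB, because usage * 2 >= max.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool swap_full_example(unsigned long usage, unsigned long max)
{
	return usage * 2 >= max;
}

int main(void)
{
	printf("%d\n", swap_full_example(131072, 262144));	/* 1: full     */
	printf("%d\n", swap_full_example(131071, 262144));	/* 0: not full */
	return 0;
}
#endif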
7599
7600static int __init setup_swap_account(char *s)
7601{
7602 pr_warn_once("The swapaccount= commandline option is deprecated. "
7603 "Please report your usecase to linux-mm@kvack.org if you "
7604 "depend on this functionality.\n");
7605 return 1;
7606}
7607__setup("swapaccount=", setup_swap_account);
7608
7609static u64 swap_current_read(struct cgroup_subsys_state *css,
7610 struct cftype *cft)
7611{
7612 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7613
7614 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7615}
7616
7617static u64 swap_peak_read(struct cgroup_subsys_state *css,
7618 struct cftype *cft)
7619{
7620 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7621
7622 return (u64)memcg->swap.watermark * PAGE_SIZE;
7623}
7624
7625static int swap_high_show(struct seq_file *m, void *v)
7626{
7627 return seq_puts_memcg_tunable(m,
7628 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7629}
7630
7631static ssize_t swap_high_write(struct kernfs_open_file *of,
7632 char *buf, size_t nbytes, loff_t off)
7633{
7634 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7635 unsigned long high;
7636 int err;
7637
7638 buf = strstrip(buf);
7639 err = page_counter_memparse(buf, "max", &high);
7640 if (err)
7641 return err;
7642
7643 page_counter_set_high(&memcg->swap, high);
7644
7645 return nbytes;
7646}
7647
7648static int swap_max_show(struct seq_file *m, void *v)
7649{
7650 return seq_puts_memcg_tunable(m,
7651 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7652}
7653
7654static ssize_t swap_max_write(struct kernfs_open_file *of,
7655 char *buf, size_t nbytes, loff_t off)
7656{
7657 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7658 unsigned long max;
7659 int err;
7660
7661 buf = strstrip(buf);
7662 err = page_counter_memparse(buf, "max", &max);
7663 if (err)
7664 return err;
7665
7666 xchg(&memcg->swap.max, max);
7667
7668 return nbytes;
7669}
7670
7671static int swap_events_show(struct seq_file *m, void *v)
7672{
7673 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7674
7675 seq_printf(m, "high %lu\n",
7676 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7677 seq_printf(m, "max %lu\n",
7678 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7679 seq_printf(m, "fail %lu\n",
7680 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7681
7682 return 0;
7683}
7684
static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_peak_read,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

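/*
 * Editorial note: these entries are registered on the default (v2)
 * hierarchy by mem_cgroup_swap_init() below, and the cgroup core adds
 * the controller name as a prefix, so they appear in every non-root
 * cgroup as memory.swap.current, memory.swap.high, memory.swap.max,
 * memory.swap.peak and memory.swap.events, e.g.
 * /sys/fs/cgroup/<group>/memory.swap.max.
 */
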
static struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

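/*
 * Caller-side sketch (illustrative only; the real store path lives in
 * mm/zswap.c and is more involved): the optimistic pre-check gates
 * compression, and only the actual compressed size is charged after
 * the fact.
 *
 *	// objcg obtained from the folio being swapped out
 *	if (objcg && !obj_cgroup_may_zswap(objcg))
 *		goto reject;			// no headroom anywhere in the hierarchy
 *	len = compress(folio);			// hypothetical compression step
 *	obj_cgroup_charge_zswap(objcg, len);	// charge the real compressed size
 */
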
/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}

/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
 */
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	obj_cgroup_uncharge(objcg, size);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
	rcu_read_unlock();
}

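/*
 * Editorial note: charges and uncharges are expected to pair up with the
 * same @size per stored object, since MEMCG_ZSWAP_B is tracked in bytes
 * and MEMCG_ZSWAPPED in stored pages.  A store that charged a 1300-byte
 * compressed page followed by a swapin that uncharges the same 1300
 * bytes leaves both counters, and therefore zswap.current, balanced.
 */
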
static u64 zswap_current_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	cgroup_rstat_flush(css->cgroup);
	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
}

static int zswap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
}

static ssize_t zswap_max_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->zswap_max, max);

	return nbytes;
}

static struct cftype zswap_files[] = {
	{
		.name = "zswap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = zswap_current_read,
	},
	{
		.name = "zswap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = zswap_max_show,
		.write = zswap_max_write,
	},
	{ }	/* terminate */
};
#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */

static int __init mem_cgroup_swap_init(void)
{
	if (mem_cgroup_disabled())
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
#endif
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_SWAP */