mm/shmem.c at v6.19 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / shmem.c
at v6.19 6031 lines 163 kB view raw
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *		 2000 Transmeta Corp.
   6 *		 2000-2001 Christoph Rohland
   7 *		 2000-2001 SAP AG
   8 *		 2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/fileattr.h>
  32#include <linux/mm.h>
  33#include <linux/random.h>
  34#include <linux/sched/signal.h>
  35#include <linux/export.h>
  36#include <linux/shmem_fs.h>
  37#include <linux/swap.h>
  38#include <linux/uio.h>
  39#include <linux/hugetlb.h>
  40#include <linux/fs_parser.h>
  41#include <linux/swapfile.h>
  42#include <linux/iversion.h>
  43#include <linux/unicode.h>
  44#include "swap.h"
  45
/*
 * vfsmount pointer private to this file.  __ro_after_init: not meant to be
 * written again once kernel init has finished.
 * NOTE(review): presumably the kernel-internal shm mount (comments further
 * down mention "shm_mnt and all mounts"); the place where it is assigned is
 * not visible in this part of the file - confirm there.
 */
static struct vfsmount *shm_mnt __ro_after_init;
  47
  48#ifdef CONFIG_SHMEM
  49/*
  50 * This virtual memory filesystem is heavily based on the ramfs. It
  51 * extends ramfs by the ability to use swap and honor resource limits
  52 * which makes it a completely usable filesystem.
  53 */
  54
  55#include <linux/xattr.h>
  56#include <linux/exportfs.h>
  57#include <linux/posix_acl.h>
  58#include <linux/posix_acl_xattr.h>
  59#include <linux/mman.h>
  60#include <linux/string.h>
  61#include <linux/slab.h>
  62#include <linux/backing-dev.h>
  63#include <linux/writeback.h>
  64#include <linux/pagevec.h>
  65#include <linux/percpu_counter.h>
  66#include <linux/falloc.h>
  67#include <linux/splice.h>
  68#include <linux/security.h>
  69#include <linux/leafops.h>
  70#include <linux/mempolicy.h>
  71#include <linux/namei.h>
  72#include <linux/ctype.h>
  73#include <linux/migrate.h>
  74#include <linux/highmem.h>
  75#include <linux/seq_file.h>
  76#include <linux/magic.h>
  77#include <linux/syscalls.h>
  78#include <linux/fcntl.h>
  79#include <uapi/linux/memfd.h>
  80#include <linux/rmap.h>
  81#include <linux/uuid.h>
  82#include <linux/quotaops.h>
  83#include <linux/rcupdate_wait.h>
  84
  85#include <linux/uaccess.h>
  86
  87#include "internal.h"
  88
/* Convert a byte count to a page count, rounding up to a whole page */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlinks up to this size are kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
  99
/*
 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 *
 * struct shmem_falloc is the record used for that communication; the code
 * that publishes and reads it is outside this part of the file.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
};
 112
/*
 * Collection of option values for a shmem mount.
 * NOTE(review): the code that fills in and consumes this structure is outside
 * this part of the file, so the notes below are limited to what the
 * declarations themselves show.
 */
struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;	/* presumably a mask of the SHMEM_SEEN_* bits below - confirm */
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#if IS_ENABLED(CONFIG_UNICODE)
	/* These two members only exist when CONFIG_UNICODE is enabled */
	struct unicode_map *encoding;
	bool strict_encoding;
#endif
/* Distinct single-bit values, so several can be combined in one integer */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_QUOTA 16
};
 136
 137#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * NOTE(review): these look like bitmasks with one bit per folio order
 * (shmem_parse_huge() compares huge_shmem_orders_inherit against
 * BIT(HPAGE_PMD_ORDER)); the code that sets them is outside this part of the
 * file - confirm there.
 */
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
/* __initdata: only meant to be referenced while the kernel is initialising */
static bool shmem_orders_configured __initdata;
 143#endif
 144
 145#ifdef CONFIG_TMPFS
 146static unsigned long shmem_default_max_blocks(void)
 147{
 148	return totalram_pages() / 2;
 149}
 150
 151static unsigned long shmem_default_max_inodes(void)
 152{
 153	unsigned long nr_pages = totalram_pages();
 154
 155	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
 156			ULONG_MAX / BOGO_INODE_SIZE);
 157}
 158#endif
 159
/*
 * Forward declaration only: the body of shmem_swapin_folio() is not in this
 * part of the file.
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
			struct vm_area_struct *vma, vm_fault_t *fault_type);
 163
 164static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 165{
 166	return sb->s_fs_info;
 167}
 168
 169/*
 170 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 171 * for shared memory and for shared anonymous (/dev/zero) mappings
 172 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 173 * consistent with the pre-accounting of private mappings ...
 174 */
 175static inline int shmem_acct_size(unsigned long flags, loff_t size)
 176{
 177	return (flags & SHMEM_F_NORESERVE) ?
 178		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 179}
 180
 181static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 182{
 183	if (!(flags & SHMEM_F_NORESERVE))
 184		vm_unacct_memory(VM_ACCT(size));
 185}
 186
 187static inline int shmem_reacct_size(unsigned long flags,
 188		loff_t oldsize, loff_t newsize)
 189{
 190	if (!(flags & SHMEM_F_NORESERVE)) {
 191		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 192			return security_vm_enough_memory_mm(current->mm,
 193					VM_ACCT(newsize) - VM_ACCT(oldsize));
 194		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 195			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 196	}
 197	return 0;
 198}
 199
 200/*
 201 * ... whereas tmpfs objects are accounted incrementally as
 202 * pages are allocated, in order to allow large sparse files.
 203 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 204 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 205 */
 206static inline int shmem_acct_blocks(unsigned long flags, long pages)
 207{
 208	if (!(flags & SHMEM_F_NORESERVE))
 209		return 0;
 210
 211	return security_vm_enough_memory_mm(current->mm,
 212			pages * VM_ACCT(PAGE_SIZE));
 213}
 214
 215static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 216{
 217	if (flags & SHMEM_F_NORESERVE)
 218		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 219}
 220
 221int shmem_inode_acct_blocks(struct inode *inode, long pages)
 222{
 223	struct shmem_inode_info *info = SHMEM_I(inode);
 224	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 225	int err = -ENOSPC;
 226
 227	if (shmem_acct_blocks(info->flags, pages))
 228		return err;
 229
 230	might_sleep();	/* when quotas */
 231	if (sbinfo->max_blocks) {
 232		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
 233						sbinfo->max_blocks, pages))
 234			goto unacct;
 235
 236		err = dquot_alloc_block_nodirty(inode, pages);
 237		if (err) {
 238			percpu_counter_sub(&sbinfo->used_blocks, pages);
 239			goto unacct;
 240		}
 241	} else {
 242		err = dquot_alloc_block_nodirty(inode, pages);
 243		if (err)
 244			goto unacct;
 245	}
 246
 247	return 0;
 248
 249unacct:
 250	shmem_unacct_blocks(info->flags, pages);
 251	return err;
 252}
 253
 254static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
 255{
 256	struct shmem_inode_info *info = SHMEM_I(inode);
 257	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 258
 259	might_sleep();	/* when quotas */
 260	dquot_free_block_nodirty(inode, pages);
 261
 262	if (sbinfo->max_blocks)
 263		percpu_counter_sub(&sbinfo->used_blocks, pages);
 264	shmem_unacct_blocks(info->flags, pages);
 265}
 266
/*
 * Operation tables and the filesystem type, declared ahead of use so that
 * the helpers below (shmem_mapping(), vma_is_anon_shmem(), vma_is_shmem())
 * can compare against their addresses.  Their initialisers are not in this
 * part of the file.
 */
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
 276
 277bool shmem_mapping(const struct address_space *mapping)
 278{
 279	return mapping->a_ops == &shmem_aops;
 280}
 281EXPORT_SYMBOL_GPL(shmem_mapping);
 282
 283bool vma_is_anon_shmem(const struct vm_area_struct *vma)
 284{
 285	return vma->vm_ops == &shmem_anon_vm_ops;
 286}
 287
 288bool vma_is_shmem(const struct vm_area_struct *vma)
 289{
 290	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
 291}
 292
/*
 * A file-wide list head and a spinlock declared alongside it.
 * NOTE(review): presumably shmem_swaplist_lock protects shmem_swaplist, and
 * the list links inodes that have pages out on swap - the users of both are
 * outside this part of the file, confirm there.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_SPINLOCK(shmem_swaplist_lock);
 295
 296#ifdef CONFIG_TMPFS_QUOTA
 297
 298static int shmem_enable_quotas(struct super_block *sb,
 299			       unsigned short quota_types)
 300{
 301	int type, err = 0;
 302
 303	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
 304	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
 305		if (!(quota_types & (1 << type)))
 306			continue;
 307		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
 308					  DQUOT_USAGE_ENABLED |
 309					  DQUOT_LIMITS_ENABLED);
 310		if (err)
 311			goto out_err;
 312	}
 313	return 0;
 314
 315out_err:
 316	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
 317		type, err);
 318	for (type--; type >= 0; type--)
 319		dquot_quota_off(sb, type);
 320	return err;
 321}
 322
 323static void shmem_disable_quotas(struct super_block *sb)
 324{
 325	int type;
 326
 327	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
 328		dquot_quota_off(sb, type);
 329}
 330
 331static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
 332{
 333	return SHMEM_I(inode)->i_dquot;
 334}
 335#endif /* CONFIG_TMPFS_QUOTA */
 336
 337/*
 338 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 339 * produces a novel ino for the newly allocated inode.
 340 *
 341 * It may also be called when making a hard link to permit the space needed by
 342 * each dentry. However, in that case, no new inode number is needed since that
 343 * internally draws from another pool of inode numbers (currently global
 344 * get_next_ino()). This case is indicated by passing NULL as inop.
 345 */
 346#define SHMEM_INO_BATCH 1024
 347static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
 348{
 349	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 350	ino_t ino;
 351
 352	if (!(sb->s_flags & SB_KERNMOUNT)) {
 353		raw_spin_lock(&sbinfo->stat_lock);
 354		if (sbinfo->max_inodes) {
 355			if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
 356				raw_spin_unlock(&sbinfo->stat_lock);
 357				return -ENOSPC;
 358			}
 359			sbinfo->free_ispace -= BOGO_INODE_SIZE;
 360		}
 361		if (inop) {
 362			ino = sbinfo->next_ino++;
 363			if (unlikely(is_zero_ino(ino)))
 364				ino = sbinfo->next_ino++;
 365			if (unlikely(!sbinfo->full_inums &&
 366				     ino > UINT_MAX)) {
 367				/*
 368				 * Emulate get_next_ino uint wraparound for
 369				 * compatibility
 370				 */
 371				if (IS_ENABLED(CONFIG_64BIT))
 372					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
 373						__func__, MINOR(sb->s_dev));
 374				sbinfo->next_ino = 1;
 375				ino = sbinfo->next_ino++;
 376			}
 377			*inop = ino;
 378		}
 379		raw_spin_unlock(&sbinfo->stat_lock);
 380	} else if (inop) {
 381		/*
 382		 * __shmem_file_setup, one of our callers, is lock-free: it
 383		 * doesn't hold stat_lock in shmem_reserve_inode since
 384		 * max_inodes is always 0, and is called from potentially
 385		 * unknown contexts. As such, use a per-cpu batched allocator
 386		 * which doesn't require the per-sb stat_lock unless we are at
 387		 * the batch boundary.
 388		 *
 389		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
 390		 * shmem mounts are not exposed to userspace, so we don't need
 391		 * to worry about things like glibc compatibility.
 392		 */
 393		ino_t *next_ino;
 394
 395		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
 396		ino = *next_ino;
 397		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
 398			raw_spin_lock(&sbinfo->stat_lock);
 399			ino = sbinfo->next_ino;
 400			sbinfo->next_ino += SHMEM_INO_BATCH;
 401			raw_spin_unlock(&sbinfo->stat_lock);
 402			if (unlikely(is_zero_ino(ino)))
 403				ino++;
 404		}
 405		*inop = ino;
 406		*next_ino = ++ino;
 407		put_cpu();
 408	}
 409
 410	return 0;
 411}
 412
 413static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
 414{
 415	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 416	if (sbinfo->max_inodes) {
 417		raw_spin_lock(&sbinfo->stat_lock);
 418		sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
 419		raw_spin_unlock(&sbinfo->stat_lock);
 420	}
 421}
 422
 423/**
 424 * shmem_recalc_inode - recalculate the block usage of an inode
 425 * @inode: inode to recalc
 426 * @alloced: the change in number of pages allocated to inode
 427 * @swapped: the change in number of pages swapped from inode
 428 *
 429 * We have to calculate the free blocks since the mm can drop
 430 * undirtied hole pages behind our back.
 431 *
 432 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 433 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 434 *
 435 * Return: true if swapped was incremented from 0, for shmem_writeout().
 436 */
 437bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
 438{
 439	struct shmem_inode_info *info = SHMEM_I(inode);
 440	bool first_swapped = false;
 441	long freed;
 442
 443	spin_lock(&info->lock);
 444	info->alloced += alloced;
 445	info->swapped += swapped;
 446	freed = info->alloced - info->swapped -
 447		READ_ONCE(inode->i_mapping->nrpages);
 448	/*
 449	 * Special case: whereas normally shmem_recalc_inode() is called
 450	 * after i_mapping->nrpages has already been adjusted (up or down),
 451	 * shmem_writeout() has to raise swapped before nrpages is lowered -
 452	 * to stop a racing shmem_recalc_inode() from thinking that a page has
 453	 * been freed.  Compensate here, to avoid the need for a followup call.
 454	 */
 455	if (swapped > 0) {
 456		if (info->swapped == swapped)
 457			first_swapped = true;
 458		freed += swapped;
 459	}
 460	if (freed > 0)
 461		info->alloced -= freed;
 462	spin_unlock(&info->lock);
 463
 464	/* The quota case may block */
 465	if (freed > 0)
 466		shmem_inode_unacct_blocks(inode, freed);
 467	return first_swapped;
 468}
 469
 470bool shmem_charge(struct inode *inode, long pages)
 471{
 472	struct address_space *mapping = inode->i_mapping;
 473
 474	if (shmem_inode_acct_blocks(inode, pages))
 475		return false;
 476
 477	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
 478	xa_lock_irq(&mapping->i_pages);
 479	mapping->nrpages += pages;
 480	xa_unlock_irq(&mapping->i_pages);
 481
 482	shmem_recalc_inode(inode, pages, 0);
 483	return true;
 484}
 485
 486void shmem_uncharge(struct inode *inode, long pages)
 487{
 488	/* pages argument is currently unused: keep it to help debugging */
 489	/* nrpages adjustment done by __filemap_remove_folio() or caller */
 490
 491	shmem_recalc_inode(inode, 0, 0);
 492}
 493
 494/*
 495 * Replace item expected in xarray by a new item, while holding xa_lock.
 496 */
 497static int shmem_replace_entry(struct address_space *mapping,
 498			pgoff_t index, void *expected, void *replacement)
 499{
 500	XA_STATE(xas, &mapping->i_pages, index);
 501	void *item;
 502
 503	VM_BUG_ON(!expected);
 504	VM_BUG_ON(!replacement);
 505	item = xas_load(&xas);
 506	if (item != expected)
 507		return -ENOENT;
 508	xas_store(&xas, replacement);
 509	return 0;
 510}
 511
 512/*
 513 * Sometimes, before we decide whether to proceed or to fail, we must check
 514 * that an entry was not already brought back or split by a racing thread.
 515 *
 516 * Checking folio is not enough: by the time a swapcache folio is locked, it
 517 * might be reused, and again be swapcache, using the same swap as before.
 518 * Returns the swap entry's order if it still presents, else returns -1.
 519 */
 520static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
 521			      swp_entry_t swap)
 522{
 523	XA_STATE(xas, &mapping->i_pages, index);
 524	int ret = -1;
 525	void *entry;
 526
 527	rcu_read_lock();
 528	do {
 529		entry = xas_load(&xas);
 530		if (entry == swp_to_radix_entry(swap))
 531			ret = xas_get_order(&xas);
 532	} while (xas_retry(&xas, entry));
 533	rcu_read_unlock();
 534	return ret;
 535}
 536
/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 * Both are negative, so they cannot collide with the mount option values
 * above.
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
 568
 569#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 570/* ifdef here to avoid bloating shmem.o when not necessary */
 571
/*
 * Initial value of shmem_huge, selected at build time from the
 * CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_* options; SHMEM_HUGE_NEVER is used
 * when none of them is defined.  SHMEM_HUGE_DEFAULT is a helper for this
 * initialiser only and is undefined again right after it.
 */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER)
#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE
#else
#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
#endif

static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT;

#undef SHMEM_HUGE_DEFAULT
 587
/*
 * Initial value of tmpfs_huge, selected at build time from the
 * CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_* options; SHMEM_HUGE_NEVER is used
 * when none of them is defined.  TMPFS_HUGE_DEFAULT is a helper for this
 * initialiser only and is undefined again right after it.
 */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER)
#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS)
#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE)
#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE)
#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE
#else
#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
#endif

static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT;

#undef TMPFS_HUGE_DEFAULT
 603
 604static unsigned int shmem_get_orders_within_size(struct inode *inode,
 605		unsigned long within_size_orders, pgoff_t index,
 606		loff_t write_end)
 607{
 608	pgoff_t aligned_index;
 609	unsigned long order;
 610	loff_t i_size;
 611
 612	order = highest_order(within_size_orders);
 613	while (within_size_orders) {
 614		aligned_index = round_up(index + 1, 1 << order);
 615		i_size = max(write_end, i_size_read(inode));
 616		i_size = round_up(i_size, PAGE_SIZE);
 617		if (i_size >> PAGE_SHIFT >= aligned_index)
 618			return within_size_orders;
 619
 620		order = next_order(&within_size_orders, order);
 621	}
 622
 623	return 0;
 624}
 625
 626static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 627					      loff_t write_end, bool shmem_huge_force,
 628					      struct vm_area_struct *vma,
 629					      vm_flags_t vm_flags)
 630{
 631	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
 632		0 : BIT(HPAGE_PMD_ORDER);
 633	unsigned long within_size_orders;
 634
 635	if (!S_ISREG(inode->i_mode))
 636		return 0;
 637	if (shmem_huge == SHMEM_HUGE_DENY)
 638		return 0;
 639	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
 640		return maybe_pmd_order;
 641
 642	/*
 643	 * The huge order allocation for anon shmem is controlled through
 644	 * the mTHP interface, so we still use PMD-sized huge order to
 645	 * check whether global control is enabled.
 646	 *
 647	 * For tmpfs with 'huge=always' or 'huge=within_size' mount option,
 648	 * we will always try PMD-sized order first. If that failed, it will
 649	 * fall back to small large folios.
 650	 */
 651	switch (SHMEM_SB(inode->i_sb)->huge) {
 652	case SHMEM_HUGE_ALWAYS:
 653		return THP_ORDERS_ALL_FILE_DEFAULT;
 654	case SHMEM_HUGE_WITHIN_SIZE:
 655		within_size_orders = shmem_get_orders_within_size(inode,
 656				THP_ORDERS_ALL_FILE_DEFAULT, index, write_end);
 657		if (within_size_orders > 0)
 658			return within_size_orders;
 659
 660		fallthrough;
 661	case SHMEM_HUGE_ADVISE:
 662		if (vm_flags & VM_HUGEPAGE)
 663			return THP_ORDERS_ALL_FILE_DEFAULT;
 664		fallthrough;
 665	default:
 666		return 0;
 667	}
 668}
 669
 670static int shmem_parse_huge(const char *str)
 671{
 672	int huge;
 673
 674	if (!str)
 675		return -EINVAL;
 676
 677	if (!strcmp(str, "never"))
 678		huge = SHMEM_HUGE_NEVER;
 679	else if (!strcmp(str, "always"))
 680		huge = SHMEM_HUGE_ALWAYS;
 681	else if (!strcmp(str, "within_size"))
 682		huge = SHMEM_HUGE_WITHIN_SIZE;
 683	else if (!strcmp(str, "advise"))
 684		huge = SHMEM_HUGE_ADVISE;
 685	else if (!strcmp(str, "deny"))
 686		huge = SHMEM_HUGE_DENY;
 687	else if (!strcmp(str, "force"))
 688		huge = SHMEM_HUGE_FORCE;
 689	else
 690		return -EINVAL;
 691
 692	if (!has_transparent_hugepage() &&
 693	    huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
 694		return -EINVAL;
 695
 696	/* Do not override huge allocation policy with non-PMD sized mTHP */
 697	if (huge == SHMEM_HUGE_FORCE &&
 698	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
 699		return -EINVAL;
 700
 701	return huge;
 702}
 703
 704#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 705static const char *shmem_format_huge(int huge)
 706{
 707	switch (huge) {
 708	case SHMEM_HUGE_NEVER:
 709		return "never";
 710	case SHMEM_HUGE_ALWAYS:
 711		return "always";
 712	case SHMEM_HUGE_WITHIN_SIZE:
 713		return "within_size";
 714	case SHMEM_HUGE_ADVISE:
 715		return "advise";
 716	case SHMEM_HUGE_DENY:
 717		return "deny";
 718	case SHMEM_HUGE_FORCE:
 719		return "force";
 720	default:
 721		VM_BUG_ON(1);
 722		return "bad_val";
 723	}
 724}
 725#endif
 726
 727static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 728		struct shrink_control *sc, unsigned long nr_to_free)
 729{
 730	LIST_HEAD(list), *pos, *next;
 731	struct inode *inode;
 732	struct shmem_inode_info *info;
 733	struct folio *folio;
 734	unsigned long batch = sc ? sc->nr_to_scan : 128;
 735	unsigned long split = 0, freed = 0;
 736
 737	if (list_empty(&sbinfo->shrinklist))
 738		return SHRINK_STOP;
 739
 740	spin_lock(&sbinfo->shrinklist_lock);
 741	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
 742		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 743
 744		/* pin the inode */
 745		inode = igrab(&info->vfs_inode);
 746
 747		/* inode is about to be evicted */
 748		if (!inode) {
 749			list_del_init(&info->shrinklist);
 750			goto next;
 751		}
 752
 753		list_move(&info->shrinklist, &list);
 754next:
 755		sbinfo->shrinklist_len--;
 756		if (!--batch)
 757			break;
 758	}
 759	spin_unlock(&sbinfo->shrinklist_lock);
 760
 761	list_for_each_safe(pos, next, &list) {
 762		pgoff_t next, end;
 763		loff_t i_size;
 764		int ret;
 765
 766		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 767		inode = &info->vfs_inode;
 768
 769		if (nr_to_free && freed >= nr_to_free)
 770			goto move_back;
 771
 772		i_size = i_size_read(inode);
 773		folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
 774		if (!folio || xa_is_value(folio))
 775			goto drop;
 776
 777		/* No large folio at the end of the file: nothing to split */
 778		if (!folio_test_large(folio)) {
 779			folio_put(folio);
 780			goto drop;
 781		}
 782
 783		/* Check if there is anything to gain from splitting */
 784		next = folio_next_index(folio);
 785		end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
 786		if (end <= folio->index || end >= next) {
 787			folio_put(folio);
 788			goto drop;
 789		}
 790
 791		/*
 792		 * Move the inode on the list back to shrinklist if we failed
 793		 * to lock the page at this time.
 794		 *
 795		 * Waiting for the lock may lead to deadlock in the
 796		 * reclaim path.
 797		 */
 798		if (!folio_trylock(folio)) {
 799			folio_put(folio);
 800			goto move_back;
 801		}
 802
 803		ret = split_folio(folio);
 804		folio_unlock(folio);
 805		folio_put(folio);
 806
 807		/* If split failed move the inode on the list back to shrinklist */
 808		if (ret)
 809			goto move_back;
 810
 811		freed += next - end;
 812		split++;
 813drop:
 814		list_del_init(&info->shrinklist);
 815		goto put;
 816move_back:
 817		/*
 818		 * Make sure the inode is either on the global list or deleted
 819		 * from any local list before iput() since it could be deleted
 820		 * in another thread once we put the inode (then the local list
 821		 * is corrupted).
 822		 */
 823		spin_lock(&sbinfo->shrinklist_lock);
 824		list_move(&info->shrinklist, &sbinfo->shrinklist);
 825		sbinfo->shrinklist_len++;
 826		spin_unlock(&sbinfo->shrinklist_lock);
 827put:
 828		iput(inode);
 829	}
 830
 831	return split;
 832}
 833
 834static long shmem_unused_huge_scan(struct super_block *sb,
 835		struct shrink_control *sc)
 836{
 837	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 838
 839	if (!READ_ONCE(sbinfo->shrinklist_len))
 840		return SHRINK_STOP;
 841
 842	return shmem_unused_huge_shrink(sbinfo, sc, 0);
 843}
 844
 845static long shmem_unused_huge_count(struct super_block *sb,
 846		struct shrink_control *sc)
 847{
 848	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 849	return READ_ONCE(sbinfo->shrinklist_len);
 850}
 851#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 852
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE there is no shmem_huge variable: the
 * name expands to the constant SHMEM_HUGE_DENY instead.
 */
#define shmem_huge SHMEM_HUGE_DENY

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: does nothing and returns 0 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_free)
{
	return 0;
}
 860
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: always returns 0, i.e. an empty set of
 * orders, whatever the arguments.
 */
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
					      loff_t write_end, bool shmem_huge_force,
					      struct vm_area_struct *vma,
					      vm_flags_t vm_flags)
{
	return 0;
}
 868#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 869
 870static void shmem_update_stats(struct folio *folio, int nr_pages)
 871{
 872	if (folio_test_pmd_mappable(folio))
 873		lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
 874	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
 875	lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
 876}
 877
/*
 * Somewhat like filemap_add_folio, but error if expected item has gone.
 *
 * @folio:	locked, swapbacked folio to insert (both are asserted below)
 * @mapping:	address space to insert into
 * @index:	page offset; must be aligned to the folio's number of pages
 * @expected:	NULL if the range must be empty, otherwise the swap entry (in
 *		xarray value form) that must currently fill the range
 * @gfp:	allocation flags; only the GFP_RECLAIM_MASK bits are used
 *
 * Returns 0 on success.  On failure folio->mapping is reset to NULL, the
 * extra references are dropped again and the xarray error is returned
 * (-EEXIST when the range does not hold what @expected demands).
 */
int shmem_add_to_page_cache(struct folio *folio,
			    struct address_space *mapping,
			    pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	unsigned long nr = folio_nr_pages(folio);
	swp_entry_t iter, swap;
	void *entry;

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	/* One extra reference per page, taken before the insertion attempt */
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	gfp &= GFP_RECLAIM_MASK;
	folio_throttle_swaprate(folio, gfp);
	swap = radix_to_swp_entry(expected);

	do {
		/* iter tracks the swap value the next entry must carry */
		iter = swap;
		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			/*
			 * The range must either be empty, or filled with
			 * expected swap entries. Shmem swap entries are never
			 * partially freed without split of both entry and
			 * folio, so there shouldn't be any holes.
			 */
			if (!expected || entry != swp_to_radix_entry(iter)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/* Step past the pages covered by this entry */
			iter.val += 1 << xas_get_order(&xas);
		}
		/* The entries seen must add up to exactly nr pages */
		if (expected && iter.val - nr != swap.val) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		shmem_update_stats(folio, nr);
		mapping->nrpages += nr;
unlock:
		xas_unlock_irq(&xas);
		/* Go round again for as long as xas_nomem() returns true */
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		/* Undo the setup done before the loop */
		folio->mapping = NULL;
		folio_ref_sub(folio, nr);
		return xas_error(&xas);
	}

	return 0;
}
 939
 940/*
 941 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 942 */
 943static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 944{
 945	struct address_space *mapping = folio->mapping;
 946	long nr = folio_nr_pages(folio);
 947	int error;
 948
 949	xa_lock_irq(&mapping->i_pages);
 950	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
 951	folio->mapping = NULL;
 952	mapping->nrpages -= nr;
 953	shmem_update_stats(folio, -nr);
 954	xa_unlock_irq(&mapping->i_pages);
 955	folio_put_refs(folio, nr);
 956	BUG_ON(error);
 957}
 958
 959/*
 960 * Remove swap entry from page cache, free the swap and its page cache. Returns
 961 * the number of pages being freed. 0 means entry not found in XArray (0 pages
 962 * being freed).
 963 */
 964static long shmem_free_swap(struct address_space *mapping,
 965			    pgoff_t index, pgoff_t end, void *radswap)
 966{
 967	XA_STATE(xas, &mapping->i_pages, index);
 968	unsigned int nr_pages = 0;
 969	pgoff_t base;
 970	void *entry;
 971
 972	xas_lock_irq(&xas);
 973	entry = xas_load(&xas);
 974	if (entry == radswap) {
 975		nr_pages = 1 << xas_get_order(&xas);
 976		base = round_down(xas.xa_index, nr_pages);
 977		if (base < index || base + nr_pages - 1 > end)
 978			nr_pages = 0;
 979		else
 980			xas_store(&xas, NULL);
 981	}
 982	xas_unlock_irq(&xas);
 983
 984	if (nr_pages)
 985		free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages);
 986
 987	return nr_pages;
 988}
 989
 990/*
 991 * Determine (in bytes) how many of the shmem object's pages mapped by the
 992 * given offsets are swapped out.
 993 *
 994 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 995 * as long as the inode doesn't go away and racy results are not a problem.
 996 */
 997unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 998						pgoff_t start, pgoff_t end)
 999{
1000	XA_STATE(xas, &mapping->i_pages, start);
1001	struct folio *folio;
1002	unsigned long swapped = 0;
1003	unsigned long max = end - 1;
1004
1005	rcu_read_lock();
1006	xas_for_each(&xas, folio, max) {
1007		if (xas_retry(&xas, folio))
1008			continue;
1009		if (xa_is_value(folio))
1010			swapped += 1 << xas_get_order(&xas);
1011		if (xas.xa_index == max)
1012			break;
1013		if (need_resched()) {
1014			xas_pause(&xas);
1015			cond_resched_rcu();
1016		}
1017	}
1018	rcu_read_unlock();
1019
1020	return swapped << PAGE_SHIFT;
1021}
1022
1023/*
1024 * Determine (in bytes) how many of the shmem object's pages mapped by the
1025 * given vma is swapped out.
1026 *
1027 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
1028 * as long as the inode doesn't go away and racy results are not a problem.
1029 */
1030unsigned long shmem_swap_usage(struct vm_area_struct *vma)
1031{
1032	struct inode *inode = file_inode(vma->vm_file);
1033	struct shmem_inode_info *info = SHMEM_I(inode);
1034	struct address_space *mapping = inode->i_mapping;
1035	unsigned long swapped;
1036
1037	/* Be careful as we don't hold info->lock */
1038	swapped = READ_ONCE(info->swapped);
1039
1040	/*
1041	 * The easier cases are when the shmem object has nothing in swap, or
1042	 * the vma maps it whole. Then we can simply use the stats that we
1043	 * already track.
1044	 */
1045	if (!swapped)
1046		return 0;
1047
1048	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
1049		return swapped << PAGE_SHIFT;
1050
1051	/* Here comes the more involved part */
1052	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
1053					vma->vm_pgoff + vma_pages(vma));
1054}
1055
1056/*
1057 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
1058 */
1059void shmem_unlock_mapping(struct address_space *mapping)
1060{
1061	struct folio_batch fbatch;
1062	pgoff_t index = 0;
1063
1064	folio_batch_init(&fbatch);
1065	/*
1066	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
1067	 */
1068	while (!mapping_unevictable(mapping) &&
1069	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
1070		check_move_unevictable_folios(&fbatch);
1071		folio_batch_release(&fbatch);
1072		cond_resched();
1073	}
1074}
1075
1076static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
1077{
1078	struct folio *folio;
1079
1080	/*
1081	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
1082	 * beyond i_size, and reports fallocated folios as holes.
1083	 */
1084	folio = filemap_get_entry(inode->i_mapping, index);
1085	if (!folio)
1086		return folio;
1087	if (!xa_is_value(folio)) {
1088		folio_lock(folio);
1089		if (folio->mapping == inode->i_mapping)
1090			return folio;
1091		/* The folio has been swapped out */
1092		folio_unlock(folio);
1093		folio_put(folio);
1094	}
1095	/*
1096	 * But read a folio back from swap if any of it is within i_size
1097	 * (although in some cases this is just a waste of time).
1098	 */
1099	folio = NULL;
1100	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
1101	return folio;
1102}
1103
/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @lstart and @lend are byte offsets; @lend == -1 is treated as "no upper
 * limit".  Internally the range becomes the page indices [start, end):
 * start is @lstart rounded up to a page, end is (@lend + 1) rounded down.
 *
 * The work is done in up to three steps:
 *  1. a first pass over whatever find_lock_entries() returns for
 *     [start, end);
 *  2. unless unfalloc, the folios containing @lstart and @lend are passed
 *     to truncate_inode_partial_folio();
 *  3. a second pass using find_get_entries(), which retries when an entry
 *     changed underneath it and, when truncating to the end, restarts
 *     until nothing is left.
 *
 * Finally the number of swap pages freed is handed to shmem_recalc_inode().
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio *folio;
	bool same_folio;
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	/* Pull fallocend back to start when it falls inside the removed range */
	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
		info->fallocend = start;

	/* First pass: entries returned already locked by find_lock_entries() */
	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				/* Swap entries are left alone when undoing fallocate */
				if (unfalloc)
					continue;
				nr_swaps_freed += shmem_free_swap(mapping, indices[i],
								  end - 1, folio);
				continue;
			}

			/* When undoing fallocate, only !uptodate folios are removed */
			if (!unfalloc || !folio_test_uptodate(folio))
				truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/*
	 * When undoing a failed fallocate, we want none of the partial folio
	 * zeroing and splitting below, but shall want to truncate the whole
	 * folio when !uptodate indicates that it was added by this fallocate,
	 * even when [lstart, lend] covers only a part of the folio.
	 */
	if (unfalloc)
		goto whole_folios;

	/* Partial folio at the start of the range (may also hold the end) */
	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
	if (folio) {
		same_folio = lend < folio_next_pos(folio);
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			/* Folio is kept: the second pass must not cover it */
			start = folio_next_index(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	/* Partial folio at the end of the range, if it is a different one */
	if (!same_folio)
		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
	if (folio) {
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend))
			end = folio->index;
		folio_unlock(folio);
		folio_put(folio);
	}

whole_folios:

	/* Second pass: unlocked lookup, locking each folio as it is handled */
	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				int order;
				long swaps_freed;

				if (unfalloc)
					continue;
				swaps_freed = shmem_free_swap(mapping, indices[i],
							      end - 1, folio);
				if (!swaps_freed) {
					pgoff_t base = indices[i];

					order = shmem_confirm_swap(mapping, indices[i],
								   radix_to_swp_entry(folio));
					/*
					 * If we found a large swap entry crossing the
					 * start or end border, skip it: the
					 * truncate_inode_partial_folio() above should
					 * have zeroed its content at least once.
					 */
					if (order > 0) {
						base = round_down(base, 1 << order);
						if (base < start || base + (1 << order) > end)
							continue;
					}
					/* Swap was replaced by page or extended, retry */
					index = base;
					break;
				}
				nr_swaps_freed += swaps_freed;
				continue;
			}

			folio_lock(folio);

			if (!unfalloc || !folio_test_uptodate(folio)) {
				if (folio_mapping(folio) != mapping) {
					/* Page was replaced by swap: retry */
					folio_unlock(folio);
					index = indices[i];
					break;
				}
				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
						folio);

				if (!folio_test_large(folio)) {
					truncate_inode_folio(mapping, folio);
				} else if (truncate_inode_partial_folio(folio, lstart, lend)) {
					/*
					 * If we split a page, reset the loop so
					 * that we pick up the new sub pages.
					 * Otherwise the THP was entirely
					 * dropped or the target range was
					 * zeroed, so just continue the loop as
					 * is.
					 */
					if (!folio_test_large(folio)) {
						folio_unlock(folio);
						index = start;
						break;
					}
				}
			}
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
	}

	/* Account for the swap pages freed by both passes */
	shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
1274
1275void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
1276{
1277	shmem_undo_range(inode, lstart, lend, false);
1278	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1279	inode_inc_iversion(inode);
1280}
1281EXPORT_SYMBOL_GPL(shmem_truncate_range);
1282
1283static int shmem_getattr(struct mnt_idmap *idmap,
1284			 const struct path *path, struct kstat *stat,
1285			 u32 request_mask, unsigned int query_flags)
1286{
1287	struct inode *inode = path->dentry->d_inode;
1288	struct shmem_inode_info *info = SHMEM_I(inode);
1289
1290	if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1291		shmem_recalc_inode(inode, 0, 0);
1292
1293	if (info->fsflags & FS_APPEND_FL)
1294		stat->attributes |= STATX_ATTR_APPEND;
1295	if (info->fsflags & FS_IMMUTABLE_FL)
1296		stat->attributes |= STATX_ATTR_IMMUTABLE;
1297	if (info->fsflags & FS_NODUMP_FL)
1298		stat->attributes |= STATX_ATTR_NODUMP;
1299	stat->attributes_mask |= (STATX_ATTR_APPEND |
1300			STATX_ATTR_IMMUTABLE |
1301			STATX_ATTR_NODUMP);
1302	generic_fillattr(idmap, request_mask, inode, stat);
1303
1304	if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
1305		stat->blksize = HPAGE_PMD_SIZE;
1306
1307	if (request_mask & STATX_BTIME) {
1308		stat->result_mask |= STATX_BTIME;
1309		stat->btime.tv_sec = info->i_crtime.tv_sec;
1310		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1311	}
1312
1313	return 0;
1314}
1315
1316static int shmem_setattr(struct mnt_idmap *idmap,
1317			 struct dentry *dentry, struct iattr *attr)
1318{
1319	struct inode *inode = d_inode(dentry);
1320	struct shmem_inode_info *info = SHMEM_I(inode);
1321	int error;
1322	bool update_mtime = false;
1323	bool update_ctime = true;
1324
1325	error = setattr_prepare(idmap, dentry, attr);
1326	if (error)
1327		return error;
1328
1329	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1330		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1331			return -EPERM;
1332		}
1333	}
1334
1335	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1336		loff_t oldsize = inode->i_size;
1337		loff_t newsize = attr->ia_size;
1338
1339		/* protected by i_rwsem */
1340		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1341		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1342			return -EPERM;
1343
1344		if (newsize != oldsize) {
1345			if (info->flags & SHMEM_F_MAPPING_FROZEN)
1346				return -EPERM;
1347			error = shmem_reacct_size(SHMEM_I(inode)->flags,
1348					oldsize, newsize);
1349			if (error)
1350				return error;
1351			i_size_write(inode, newsize);
1352			update_mtime = true;
1353		} else {
1354			update_ctime = false;
1355		}
1356		if (newsize <= oldsize) {
1357			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1358			if (oldsize > holebegin)
1359				unmap_mapping_range(inode->i_mapping,
1360							holebegin, 0, 1);
1361			if (info->alloced)
1362				shmem_truncate_range(inode,
1363							newsize, (loff_t)-1);
1364			/* unmap again to remove racily COWed private pages */
1365			if (oldsize > holebegin)
1366				unmap_mapping_range(inode->i_mapping,
1367							holebegin, 0, 1);
1368		}
1369	}
1370
1371	if (is_quota_modification(idmap, inode, attr)) {
1372		error = dquot_initialize(inode);
1373		if (error)
1374			return error;
1375	}
1376
1377	/* Transfer quota accounting */
1378	if (i_uid_needs_update(idmap, attr, inode) ||
1379	    i_gid_needs_update(idmap, attr, inode)) {
1380		error = dquot_transfer(idmap, inode, attr);
1381		if (error)
1382			return error;
1383	}
1384
1385	setattr_copy(idmap, inode, attr);
1386	if (attr->ia_valid & ATTR_MODE)
1387		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1388	if (!error && update_ctime) {
1389		inode_set_ctime_current(inode);
1390		if (update_mtime)
1391			inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
1392		inode_inc_iversion(inode);
1393	}
1394	return error;
1395}
1396
1397static void shmem_evict_inode(struct inode *inode)
1398{
1399	struct shmem_inode_info *info = SHMEM_I(inode);
1400	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1401	size_t freed = 0;
1402
1403	if (shmem_mapping(inode->i_mapping)) {
1404		shmem_unacct_size(info->flags, inode->i_size);
1405		inode->i_size = 0;
1406		mapping_set_exiting(inode->i_mapping);
1407		shmem_truncate_range(inode, 0, (loff_t)-1);
1408		if (!list_empty(&info->shrinklist)) {
1409			spin_lock(&sbinfo->shrinklist_lock);
1410			if (!list_empty(&info->shrinklist)) {
1411				list_del_init(&info->shrinklist);
1412				sbinfo->shrinklist_len--;
1413			}
1414			spin_unlock(&sbinfo->shrinklist_lock);
1415		}
1416		while (!list_empty(&info->swaplist)) {
1417			/* Wait while shmem_unuse() is scanning this inode... */
1418			wait_var_event(&info->stop_eviction,
1419				       !atomic_read(&info->stop_eviction));
1420			spin_lock(&shmem_swaplist_lock);
1421			/* ...but beware of the race if we peeked too early */
1422			if (!atomic_read(&info->stop_eviction))
1423				list_del_init(&info->swaplist);
1424			spin_unlock(&shmem_swaplist_lock);
1425		}
1426	}
1427
1428	simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
1429	shmem_free_inode(inode->i_sb, freed);
1430	WARN_ON(inode->i_blocks);
1431	clear_inode(inode);
1432#ifdef CONFIG_TMPFS_QUOTA
1433	dquot_free_inode(inode);
1434	dquot_drop(inode);
1435#endif
1436}
1437
1438static unsigned int shmem_find_swap_entries(struct address_space *mapping,
1439				pgoff_t start, struct folio_batch *fbatch,
1440				pgoff_t *indices, unsigned int type)
1441{
1442	XA_STATE(xas, &mapping->i_pages, start);
1443	struct folio *folio;
1444	swp_entry_t entry;
1445
1446	rcu_read_lock();
1447	xas_for_each(&xas, folio, ULONG_MAX) {
1448		if (xas_retry(&xas, folio))
1449			continue;
1450
1451		if (!xa_is_value(folio))
1452			continue;
1453
1454		entry = radix_to_swp_entry(folio);
1455		/*
1456		 * swapin error entries can be found in the mapping. But they're
1457		 * deliberately ignored here as we've done everything we can do.
1458		 */
1459		if (swp_type(entry) != type)
1460			continue;
1461
1462		indices[folio_batch_count(fbatch)] = xas.xa_index;
1463		if (!folio_batch_add(fbatch, folio))
1464			break;
1465
1466		if (need_resched()) {
1467			xas_pause(&xas);
1468			cond_resched_rcu();
1469		}
1470	}
1471	rcu_read_unlock();
1472
1473	return folio_batch_count(fbatch);
1474}
1475
1476/*
1477 * Move the swapped pages for an inode to page cache. Returns the count
1478 * of pages swapped in, or the error in case of failure.
1479 */
1480static int shmem_unuse_swap_entries(struct inode *inode,
1481		struct folio_batch *fbatch, pgoff_t *indices)
1482{
1483	int i = 0;
1484	int ret = 0;
1485	int error = 0;
1486	struct address_space *mapping = inode->i_mapping;
1487
1488	for (i = 0; i < folio_batch_count(fbatch); i++) {
1489		struct folio *folio = fbatch->folios[i];
1490
1491		error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
1492					mapping_gfp_mask(mapping), NULL, NULL);
1493		if (error == 0) {
1494			folio_unlock(folio);
1495			folio_put(folio);
1496			ret++;
1497		}
1498		if (error == -ENOMEM)
1499			break;
1500		error = 0;
1501	}
1502	return error ? error : ret;
1503}
1504
1505/*
1506 * If swap found in inode, free it and move page from swapcache to filecache.
1507 */
1508static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1509{
1510	struct address_space *mapping = inode->i_mapping;
1511	pgoff_t start = 0;
1512	struct folio_batch fbatch;
1513	pgoff_t indices[PAGEVEC_SIZE];
1514	int ret = 0;
1515
1516	do {
1517		folio_batch_init(&fbatch);
1518		if (!shmem_find_swap_entries(mapping, start, &fbatch,
1519					     indices, type)) {
1520			ret = 0;
1521			break;
1522		}
1523
1524		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1525		if (ret < 0)
1526			break;
1527
1528		start = indices[folio_batch_count(&fbatch) - 1];
1529	} while (true);
1530
1531	return ret;
1532}
1533
/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 *
 * Walks shmem_swaplist under shmem_swaplist_lock, pruning inodes that no
 * longer have anything swapped.  Returns 0 on success, or the first error
 * returned by shmem_unuse_inode(), at which point the walk stops.
 */
int shmem_unuse(unsigned int type)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	/* Unlocked check: nothing to do when no inode is on the list */
	if (list_empty(&shmem_swaplist))
		return 0;

	spin_lock(&shmem_swaplist_lock);
start_over:
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
		/*
		 * Drop the swaplist lock while searching the inode for swap;
		 * but before doing so, make sure shmem_evict_inode() will not
		 * remove placeholder inode from swaplist, nor let it be freed
		 * (igrab() would protect from unlink, but not from unmount).
		 */
		atomic_inc(&info->stop_eviction);
		spin_unlock(&shmem_swaplist_lock);

		error = shmem_unuse_inode(&info->vfs_inode, type);
		cond_resched();

		spin_lock(&shmem_swaplist_lock);
		/* Let a waiting shmem_evict_inode() proceed once the count hits 0 */
		if (atomic_dec_and_test(&info->stop_eviction))
			wake_up_var(&info->stop_eviction);
		if (error)
			break;
		/* This inode left the list while unlocked: our cursor is stale */
		if (list_empty(&info->swaplist))
			goto start_over;
		/* The list may have changed while unlocked: refresh the cursor */
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
	}
	spin_unlock(&shmem_swaplist_lock);

	return error;
}
1581
/**
 * shmem_writeout - Write the folio to swap
 * @folio: The folio to write
 * @plug: swap plug
 * @folio_list: list to put back folios on split
 *
 * Move the folio from the page cache to the swap cache.
 *
 * Return: the result of swap_writeout() when it is anything other than
 * AOP_WRITEPAGE_ACTIVATE, in which case the folio has been unlocked;
 * otherwise AOP_WRITEPAGE_ACTIVATE, with the folio redirtied and still
 * locked.
 */
int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
		struct list_head *folio_list)
{
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	pgoff_t index;
	int nr_pages;
	bool split = false;

	/* Inodes flagged SHMEM_F_LOCKED and noswap mounts are never written to swap */
	if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap)
		goto redirty;

	if (!total_swap_pages)
		goto redirty;

	/*
	 * If CONFIG_THP_SWAP is not enabled, the large folio should be
	 * split when swapping.
	 *
	 * And shrinkage of pages beyond i_size does not split swap, so
	 * swapout of a large folio crossing i_size needs to split too
	 * (unless fallocate has been used to preallocate beyond EOF).
	 */
	if (folio_test_large(folio)) {
		index = shmem_fallocend(inode,
			DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
		if ((index > folio->index && index < folio_next_index(folio)) ||
		    !IS_ENABLED(CONFIG_THP_SWAP))
			split = true;
	}

	if (split) {
		/* Also reached from below when a large folio gets no swap space */
try_split:
		/* Ensure the subpages are still dirty */
		folio_test_set_dirty(folio);
		if (split_folio_to_list(folio, folio_list))
			goto redirty;
		folio_clear_dirty(folio);
	}

	/* Sample these only now: a split above changes what @folio covers */
	index = folio->index;
	nr_pages = folio_nr_pages(folio);

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated folio arriving here is now to initialize it and write it.
	 *
	 * That's okay for a folio already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this folio in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the folio, and let shmem_fallocate() quit when too many.
	 */
	if (!folio_test_uptodate(folio)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			/* Re-read under i_lock: the unlocked test above was only a hint */
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped += nr_pages;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	/* folio_alloc_swap() returning 0 means swap space was assigned */
	if (!folio_alloc_swap(folio)) {
		bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
		int error;

		/*
		 * Add inode to shmem_unuse()'s list of swapped-out inodes,
		 * if it's not already there.  Do it now before the folio is
		 * removed from page cache, when its pagelock no longer
		 * protects the inode from eviction.  And do it now, after
		 * we've incremented swapped, because shmem_unuse() will
		 * prune a !swapped inode from the swaplist.
		 */
		if (first_swapped) {
			spin_lock(&shmem_swaplist_lock);
			if (list_empty(&info->swaplist))
				list_add(&info->swaplist, &shmem_swaplist);
			spin_unlock(&shmem_swaplist_lock);
		}

		swap_shmem_alloc(folio->swap, nr_pages);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));

		BUG_ON(folio_mapped(folio));
		error = swap_writeout(folio, plug);
		if (error != AOP_WRITEPAGE_ACTIVATE) {
			/* folio has been unlocked */
			return error;
		}

		/*
		 * The intention here is to avoid holding on to the swap when
		 * zswap was unable to compress and unable to writeback; but
		 * it will be appropriate if other reactivate cases are added.
		 */
		error = shmem_add_to_page_cache(folio, mapping, index,
				swp_to_radix_entry(folio->swap),
				__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
		/* Swap entry might be erased by racing shmem_free_swap() */
		if (!error) {
			shmem_recalc_inode(inode, 0, -nr_pages);
			swap_free_nr(folio->swap, nr_pages);
		}

		/*
		 * The swap_cache_del_folio() below could be left for
		 * shrink_folio_list()'s folio_free_swap() to dispose of;
		 * but I'm a little nervous about letting this folio out of
		 * shmem_writeout() in a hybrid half-tmpfs-half-swap state
		 * e.g. folio_mapping(folio) might give an unexpected answer.
		 */
		swap_cache_del_folio(folio);
		goto redirty;
	}
	/* No swap for a large folio: split it and try again with a smaller one */
	if (nr_pages > 1)
		goto try_split;
redirty:
	folio_mark_dirty(folio);
	return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
}
EXPORT_SYMBOL_GPL(shmem_writeout);
1727
1728#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1729static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1730{
1731	char buffer[64];
1732
1733	if (!mpol || mpol->mode == MPOL_DEFAULT)
1734		return;		/* show nothing */
1735
1736	mpol_to_str(buffer, sizeof(buffer), mpol);
1737
1738	seq_printf(seq, ",mpol=%s", buffer);
1739}
1740
1741static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1742{
1743	struct mempolicy *mpol = NULL;
1744	if (sbinfo->mpol) {
1745		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1746		mpol = sbinfo->mpol;
1747		mpol_get(mpol);
1748		raw_spin_unlock(&sbinfo->stat_lock);
1749	}
1750	return mpol;
1751}
1752#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: no mempolicy option is shown. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: always reports no mempolicy. */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
1760#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1761
1762static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
1763			pgoff_t index, unsigned int order, pgoff_t *ilx);
1764
1765static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1766			struct shmem_inode_info *info, pgoff_t index)
1767{
1768	struct mempolicy *mpol;
1769	pgoff_t ilx;
1770	struct folio *folio;
1771
1772	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
1773	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
1774	mpol_cond_put(mpol);
1775
1776	return folio;
1777}
1778
1779/*
1780 * Make sure huge_gfp is always more limited than limit_gfp.
1781 * Some of the flags set permissions, while others set limitations.
1782 */
1783static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1784{
1785	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1786	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1787	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1788	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1789
1790	/* Allow allocations only from the originally specified zones. */
1791	result |= zoneflags;
1792
1793	/*
1794	 * Minimize the result gfp by taking the union with the deny flags,
1795	 * and the intersection of the allow flags.
1796	 */
1797	result |= (limit_gfp & denyflags);
1798	result |= (huge_gfp & limit_gfp) & allowflags;
1799
1800	return result;
1801}
1802
1803#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1804bool shmem_hpage_pmd_enabled(void)
1805{
1806	if (shmem_huge == SHMEM_HUGE_DENY)
1807		return false;
1808	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
1809		return true;
1810	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
1811		return true;
1812	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
1813		return true;
1814	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
1815	    shmem_huge != SHMEM_HUGE_NEVER)
1816		return true;
1817
1818	return false;
1819}
1820
1821unsigned long shmem_allowable_huge_orders(struct inode *inode,
1822				struct vm_area_struct *vma, pgoff_t index,
1823				loff_t write_end, bool shmem_huge_force)
1824{
1825	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1826	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1827	vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
1828	unsigned int global_orders;
1829
1830	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
1831		return 0;
1832
1833	global_orders = shmem_huge_global_enabled(inode, index, write_end,
1834						  shmem_huge_force, vma, vm_flags);
1835	/* Tmpfs huge pages allocation */
1836	if (!vma || !vma_is_anon_shmem(vma))
1837		return global_orders;
1838
1839	/*
1840	 * Following the 'deny' semantics of the top level, force the huge
1841	 * option off from all mounts.
1842	 */
1843	if (shmem_huge == SHMEM_HUGE_DENY)
1844		return 0;
1845
1846	/*
1847	 * Only allow inherit orders if the top-level value is 'force', which
1848	 * means non-PMD sized THP can not override 'huge' mount option now.
1849	 */
1850	if (shmem_huge == SHMEM_HUGE_FORCE)
1851		return READ_ONCE(huge_shmem_orders_inherit);
1852
1853	/* Allow mTHP that will be fully within i_size. */
1854	mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
1855
1856	if (vm_flags & VM_HUGEPAGE)
1857		mask |= READ_ONCE(huge_shmem_orders_madvise);
1858
1859	if (global_orders > 0)
1860		mask |= READ_ONCE(huge_shmem_orders_inherit);
1861
1862	return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1863}
1864
1865static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1866					   struct address_space *mapping, pgoff_t index,
1867					   unsigned long orders)
1868{
1869	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1870	pgoff_t aligned_index;
1871	unsigned long pages;
1872	int order;
1873
1874	if (vma) {
1875		orders = thp_vma_suitable_orders(vma, vmf->address, orders);
1876		if (!orders)
1877			return 0;
1878	}
1879
1880	/* Find the highest order that can add into the page cache */
1881	order = highest_order(orders);
1882	while (orders) {
1883		pages = 1UL << order;
1884		aligned_index = round_down(index, pages);
1885		/*
1886		 * Check for conflict before waiting on a huge allocation.
1887		 * Conflict might be that a huge page has just been allocated
1888		 * and added to page cache by a racing thread, or that there
1889		 * is already at least one small page in the huge extent.
1890		 * Be careful to retry when appropriate, but not forever!
1891		 * Elsewhere -EEXIST would be the right code, but not here.
1892		 */
1893		if (!xa_find(&mapping->i_pages, &aligned_index,
1894			     aligned_index + pages - 1, XA_PRESENT))
1895			break;
1896		order = next_order(&orders, order);
1897	}
1898
1899	return orders;
1900}
1901#else
/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: no order is ever reported suitable. */
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
					   struct address_space *mapping, pgoff_t index,
					   unsigned long orders)
{
	return 0;
}
1908#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1909
1910static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
1911		struct shmem_inode_info *info, pgoff_t index)
1912{
1913	struct mempolicy *mpol;
1914	pgoff_t ilx;
1915	struct folio *folio;
1916
1917	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
1918	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1919	mpol_cond_put(mpol);
1920
1921	return folio;
1922}
1923
/*
 * Allocate a new folio for @index of @inode, charge it, add it to the
 * page cache and account its blocks.
 *
 * @orders is the bitmask of folio orders the caller allows; it is ignored
 * (treated as 0) without CONFIG_TRANSPARENT_HUGEPAGE.  When non-zero, the
 * suitable orders are tried from the highest down and @index is aligned
 * to the order that succeeds; when zero, a single order-0 folio is
 * allocated.
 *
 * Returns the new folio, locked, or an ERR_PTR():
 *  -ENOMEM when no folio could be allocated;
 *  -EEXIST when the memcg charge failed and the page cache already holds
 *   something in the target range;
 *  otherwise the error from mem_cgroup_charge(), shmem_add_to_page_cache()
 *  or shmem_inode_acct_blocks().
 */
static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
		gfp_t gfp, struct inode *inode, pgoff_t index,
		struct mm_struct *fault_mm, unsigned long orders)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long suitable_orders = 0;
	struct folio *folio = NULL;
	pgoff_t aligned_index;
	long pages;
	int error, order;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		orders = 0;

	if (orders > 0) {
		suitable_orders = shmem_suitable_orders(inode, vmf,
							mapping, index, orders);

		/* Try each suitable order in turn, starting from the highest */
		order = highest_order(suitable_orders);
		while (suitable_orders) {
			pages = 1UL << order;
			aligned_index = round_down(index, pages);
			folio = shmem_alloc_folio(gfp, order, info, aligned_index);
			if (folio) {
				index = aligned_index;
				goto allocated;
			}

			/* Allocation at this order failed: count the fallback */
			if (pages == HPAGE_PMD_NR)
				count_vm_event(THP_FILE_FALLBACK);
			count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
			order = next_order(&suitable_orders, order);
		}
	} else {
		pages = 1;
		folio = shmem_alloc_folio(gfp, 0, info, index);
	}
	if (!folio)
		return ERR_PTR(-ENOMEM);

allocated:
	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	gfp &= GFP_RECLAIM_MASK;
	error = mem_cgroup_charge(folio, fault_mm, gfp);
	if (error) {
		/* Something already in the range takes precedence over the charge error */
		if (xa_find(&mapping->i_pages, &index,
				index + pages - 1, XA_PRESENT)) {
			error = -EEXIST;
		} else if (pages > 1) {
			if (pages == HPAGE_PMD_NR) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
		}
		goto unlock;
	}

	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
	if (error)
		goto unlock;

	error = shmem_inode_acct_blocks(inode, pages);
	if (error) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		long freed;
		/*
		 * Try to reclaim some space by splitting a few
		 * large folios beyond i_size on the filesystem.
		 */
		shmem_unused_huge_shrink(sbinfo, NULL, pages);
		/*
		 * And do a shmem_recalc_inode() to account for freed pages:
		 * except our folio is there in cache, so not quite balanced.
		 */
		spin_lock(&info->lock);
		freed = pages + info->alloced - info->swapped -
			READ_ONCE(mapping->nrpages);
		if (freed > 0)
			info->alloced -= freed;
		spin_unlock(&info->lock);
		if (freed > 0)
			shmem_inode_unacct_blocks(inode, freed);
		/* Second and last attempt at accounting the blocks */
		error = shmem_inode_acct_blocks(inode, pages);
		if (error) {
			/* The folio is already in the page cache: take it out again */
			filemap_remove_folio(folio);
			goto unlock;
		}
	}

	shmem_recalc_inode(inode, pages, 0);
	folio_add_lru(folio);
	return folio;

unlock:
	folio_unlock(folio);
	folio_put(folio);
	return ERR_PTR(error);
}
2027
/*
 * Allocate a folio and start reading swap @entry into it directly,
 * without inserting the folio into the swap cache.
 *
 * A large @order is tried first when it is allowed; any failure on that
 * attempt falls back exactly once to an order-0 folio for the single
 * sub-entry that backs @index.  Before the read is issued the folio is
 * charged to the memcg, the swap entries are claimed with
 * swapcache_prepare(), and the folio is marked locked and swap-backed
 * with ->swap set.
 *
 * Returns the folio, or ERR_PTR(-EINVAL) for a non-zero @order without
 * CONFIG_TRANSPARENT_HUGEPAGE, ERR_PTR(-ENOMEM) when allocation or memcg
 * charging fails, or ERR_PTR(-EEXIST) when swapcache_prepare() fails.
 */
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
		struct vm_area_struct *vma, pgoff_t index,
		swp_entry_t entry, int order, gfp_t gfp)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	int nr_pages = 1 << order;
	struct folio *new;
	gfp_t alloc_gfp;
	void *shadow;

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success with further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	alloc_gfp = gfp;
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
		if (WARN_ON_ONCE(order))
			return ERR_PTR(-EINVAL);
	} else if (order) {
		/*
		 * If uffd is active for the vma, we need per-page fault
		 * fidelity to maintain the uffd semantics, then fallback
		 * to swapin order-0 folio, as well as for zswap case.
		 * Any existing sub folio in the swap cache also blocks
		 * mTHP swapin.
		 *
		 * 'new' is still unset on this jump; that is safe because
		 * order is non-zero here, so the fallback path retries
		 * instead of returning 'new'.
		 */
		if ((vma && unlikely(userfaultfd_armed(vma))) ||
		     !zswap_never_enabled() ||
		     non_swapcache_batch(entry, nr_pages) != nr_pages)
			goto fallback;

		alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
	}
retry:
	new = shmem_alloc_folio(alloc_gfp, order, info, index);
	if (!new) {
		new = ERR_PTR(-ENOMEM);
		goto fallback;
	}

	/* Any charge failure is reported to the caller as -ENOMEM. */
	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
					   alloc_gfp, entry)) {
		folio_put(new);
		new = ERR_PTR(-ENOMEM);
		goto fallback;
	}

	/*
	 * Prevent parallel swapin from proceeding with the swap cache flag.
	 *
	 * Of course there is another possible concurrent scenario as well,
	 * that is to say, the swap cache flag of a large folio has already
	 * been set by swapcache_prepare(), while another thread may have
	 * already split the large swap entry stored in the shmem mapping.
	 * In this case, shmem_add_to_page_cache() will help identify the
	 * concurrent swapin and return -EEXIST.
	 */
	if (swapcache_prepare(entry, nr_pages)) {
		folio_put(new);
		new = ERR_PTR(-EEXIST);
		/* Try smaller folio to avoid cache conflict */
		goto fallback;
	}

	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	new->swap = entry;

	memcg1_swapin(entry, nr_pages);
	/* Feed any shadow entry left for this swap slot to workingset. */
	shadow = swap_cache_get_shadow(entry);
	if (shadow)
		workingset_refault(new, shadow);
	folio_add_lru(new);
	swap_read_folio(new, NULL);
	return new;
fallback:
	/* Order 0 swapin failed, nothing to fallback to, abort */
	if (!order)
		return new;
	/*
	 * Retry with a single page: advance entry to the sub-entry for
	 * @index and drop the huge-page gfp adjustments.
	 * NOTE(review): assumes @entry is the first entry of the large
	 * range - confirm against the caller.
	 */
	entry.val += index - round_down(index, nr_pages);
	alloc_gfp = gfp;
	nr_pages = 1;
	order = 0;
	goto retry;
}
2114
2115/*
2116 * When a page is moved from swapcache to shmem filecache (either by the
2117 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
2118 * shmem_unuse_inode()), it may have been read in earlier from swap, in
2119 * ignorance of the mapping it belongs to.  If that mapping has special
2120 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
2121 * we may need to copy to a suitable page before moving to filecache.
2122 *
2123 * In a future release, this may well be extended to respect cpuset and
2124 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
2125 * but for now it is a simple matter of zone.
2126 */
2127static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
2128{
2129	return folio_zonenum(folio) > gfp_zone(gfp);
2130}
2131
/*
 * Replace the swap cache folio *foliop with a newly allocated folio of
 * the same order, allocated under @gfp, carrying a copy of its contents.
 *
 * On success *foliop points at the new folio, which has been marked
 * locked, swap-backed, uptodate and swapcache, and the old folio has
 * been unlocked and had its references dropped.  Returns 0 on success,
 * or -ENOMEM if the new folio cannot be allocated, in which case
 * *foliop is left untouched.
 */
static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index,
				struct vm_area_struct *vma)
{
	struct swap_cluster_info *ci;
	struct folio *new, *old = *foliop;
	swp_entry_t entry = old->swap;
	int nr_pages = folio_nr_pages(old);
	int error = 0;	/* never modified below: success returns 0 */

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* A large replacement uses the THP gfp mask, limited by gfp. */
	if (nr_pages > 1) {
		gfp_t huge_gfp = vma_thp_gfp_mask(vma);

		gfp = limit_gfp_mask(huge_gfp, gfp);
	}
#endif

	new = shmem_alloc_folio(gfp, folio_order(old), info, index);
	if (!new)
		return -ENOMEM;

	/*
	 * Take nr_pages extra references on the new folio; the matching
	 * nr_pages references on the old folio are dropped at the end.
	 */
	folio_ref_add(new, nr_pages);
	folio_copy(new, old);
	flush_dcache_folio(new);

	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	folio_mark_uptodate(new);
	new->swap = entry;
	folio_set_swapcache(new);

	/* Swap cache, memcg and stats are all switched under the cluster lock. */
	ci = swap_cluster_get_and_lock_irq(old);
	__swap_cache_replace_folio(ci, old, new);
	mem_cgroup_replace_folio(old, new);
	shmem_update_stats(new, nr_pages);
	shmem_update_stats(old, -nr_pages);
	swap_cluster_unlock_irq(ci);

	folio_add_lru(new);
	*foliop = new;

	folio_clear_swapcache(old);
	old->private = NULL;

	folio_unlock(old);
	/*
	 * The old folio has been removed from the swap cache: drop its
	 * 'nr_pages' references, as well as the one temporary reference
	 * obtained from the swap cache.
	 */
	folio_put_refs(old, nr_pages + 1);
	return error;
}
2191
/*
 * Record a failed swapin: replace the swap entry @swap stored at @index
 * in the inode's mapping with a poisoned swap entry.
 *
 * If the slot no longer holds @swap, nothing is changed.  Otherwise the
 * folio is removed from the swap cache (unless @skip_swapcache), the
 * inode accounting is reduced by the folio's page count and the swap
 * entries are freed.
 */
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
					 struct folio *folio, swp_entry_t swap,
					 bool skip_swapcache)
{
	struct address_space *mapping = inode->i_mapping;
	swp_entry_t swapin_error;
	void *old;
	int nr_pages;

	swapin_error = make_poisoned_swp_entry();
	/* Only poison the slot if it still holds the entry we failed on. */
	old = xa_cmpxchg_irq(&mapping->i_pages, index,
			     swp_to_radix_entry(swap),
			     swp_to_radix_entry(swapin_error), 0);
	if (old != swp_to_radix_entry(swap))
		return;

	nr_pages = folio_nr_pages(folio);
	folio_wait_writeback(folio);
	if (!skip_swapcache)
		swap_cache_del_folio(folio);
	/*
	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
	 * in shmem_evict_inode().
	 */
	shmem_recalc_inode(inode, -nr_pages, -nr_pages);
	swap_free_nr(swap, nr_pages);
}
2220
/*
 * Split the large swap entry covering @index in the inode's mapping so
 * that @index ends up covered by an order-0 entry.
 *
 * @swap is the value expected to be stored for @index.  Returns 0 on
 * success, including when the entry is already order 0; -EEXIST when
 * the slot no longer holds @swap; or the error left in the xa_state by
 * the split.
 */
static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
				   swp_entry_t swap, gfp_t gfp)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
	int split_order = 0;
	int i;

	/* Convert user data gfp flags to xarray node gfp flags */
	gfp &= GFP_RECLAIM_MASK;

	/* Each pass runs under the xarray lock; see xas_nomem() below. */
	for (;;) {
		void *old = NULL;
		int cur_order;
		pgoff_t swap_index;

		xas_lock_irq(&xas);
		old = xas_load(&xas);
		/* The slot must still hold exactly the expected swap value. */
		if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}

		cur_order = xas_get_order(&xas);
		/* Already an order-0 entry: nothing to split. */
		if (!cur_order)
			goto unlock;

		/* Try to split large swap entry in pagecache */
		swap_index = round_down(index, 1 << cur_order);
		split_order = xas_try_split_min_order(cur_order);

		/* Split step by step until the entry at index is order 0. */
		while (cur_order > 0) {
			pgoff_t aligned_index =
				round_down(index, 1 << cur_order);
			/* Distance from the start of the original large entry. */
			pgoff_t swap_offset = aligned_index - swap_index;

			xas_set_order(&xas, index, split_order);
			xas_try_split(&xas, old, cur_order);
			if (xas_error(&xas))
				goto unlock;

			/*
			 * Re-set the swap entry after splitting, and the swap
			 * offset of the original large entry must be continuous.
			 */
			for (i = 0; i < 1 << cur_order;
			     i += (1 << split_order)) {
				swp_entry_t tmp;

				tmp = swp_entry(swp_type(swap),
						swp_offset(swap) + swap_offset +
							i);
				__xa_store(&mapping->i_pages, aligned_index + i,
					   swp_to_radix_entry(tmp), 0);
			}
			cur_order = split_order;
			split_order = xas_try_split_min_order(split_order);
		}

unlock:
		xas_unlock_irq(&xas);

		/* Go round again only while xas_nomem() asks for a retry. */
		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		return xas_error(&xas);

	return 0;
}
2292
2293/*
2294 * Swap in the folio pointed to by *foliop.
2295 * Caller has to make sure that *foliop contains a valid swapped folio.
2296 * Returns 0 and the folio in foliop if success. On failure, returns the
2297 * error code and NULL in *foliop.
2298 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			     struct folio **foliop, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
	struct shmem_inode_info *info = SHMEM_I(inode);
	swp_entry_t swap;
	softleaf_t index_entry;
	struct swap_info_struct *si;
	struct folio *folio = NULL;
	bool skip_swapcache = false;
	int error, nr_pages, order;
	pgoff_t offset;

	/*
	 * On entry *foliop holds the xarray value entry found at index;
	 * decode it, then clear *foliop so every error return leaves NULL.
	 */
	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
	index_entry = radix_to_swp_entry(*foliop);
	swap = index_entry;
	*foliop = NULL;

	/* An earlier swapin failure left a poison marker here. */
	if (softleaf_is_poison_marker(index_entry))
		return -EIO;

	/*
	 * Get the swap device, then re-check the mapping entry; a negative
	 * order means the entry no longer matches and is reported as
	 * -EEXIST (after dropping the device if we got one).
	 */
	si = get_swap_device(index_entry);
	order = shmem_confirm_swap(mapping, index, index_entry);
	if (unlikely(!si)) {
		if (order < 0)
			return -EEXIST;
		else
			return -EINVAL;
	}
	if (unlikely(order < 0)) {
		put_swap_device(si);
		return -EEXIST;
	}

	/* index may point to the middle of a large entry, get the sub entry */
	if (order) {
		offset = index - round_down(index, 1 << order);
		swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
	}

	/* Look it up and read it in.. */
	folio = swap_cache_get_folio(swap);
	if (!folio) {
		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
			/* Direct swapin skipping swap cache & readahead */
			folio = shmem_swap_alloc_folio(inode, vma, index,
						       index_entry, order, gfp);
			if (IS_ERR(folio)) {
				error = PTR_ERR(folio);
				folio = NULL;
				goto failed;
			}
			/* Only set once a folio has actually been obtained. */
			skip_swapcache = true;
		} else {
			/* Cached swapin only supports order 0 folio */
			folio = shmem_swapin_cluster(swap, gfp, info, index);
			if (!folio) {
				error = -ENOMEM;
				goto failed;
			}
		}
		/* The folio was not in the swap cache: count a major fault. */
		if (fault_type) {
			*fault_type |= VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(fault_mm, PGMAJFAULT);
		}
	} else {
		swap_update_readahead(folio, NULL, 0);
	}

	if (order > folio_order(folio)) {
		/*
		 * Swapin may get smaller folios due to various reasons:
		 * It may fallback to order 0 due to memory pressure or race,
		 * swap readahead may swap in order 0 folios into swapcache
		 * asynchronously, while the shmem mapping can still store
		 * large swap entries. In such cases, we should split the
		 * large swap entry to prevent possible data corruption.
		 */
		error = shmem_split_large_entry(inode, index, index_entry, gfp);
		/* The folio has not been locked by us yet on this path. */
		if (error)
			goto failed_nolock;
	}

	/*
	 * If the folio is large, round down swap and index by folio size.
	 * No matter what race occurs, the swap layer ensures we either get
	 * a valid folio that has its swap entry aligned by size, or a
	 * temporarily invalid one which we'll abort very soon and retry.
	 *
	 * shmem_add_to_page_cache ensures the whole range contains expected
	 * entries and prevents any corruption, so any race split is fine
	 * too, it will succeed as long as the entries are still there.
	 */
	nr_pages = folio_nr_pages(folio);
	if (nr_pages > 1) {
		swap.val = round_down(swap.val, nr_pages);
		index = round_down(index, nr_pages);
	}

	/*
	 * We have to do this with the folio locked to prevent races.
	 * The shmem_confirm_swap below only checks if the first swap
	 * entry matches the folio, that's enough to ensure the folio
	 * is not used outside of shmem, as shmem swap entries
	 * and swap cache folios are never partially freed.
	 */
	folio_lock(folio);
	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
	    shmem_confirm_swap(mapping, index, swap) < 0 ||
	    folio->swap.val != swap.val) {
		error = -EEXIST;
		goto unlock;
	}
	/* Not uptodate after the read: treat as an I/O error. */
	if (!folio_test_uptodate(folio)) {
		error = -EIO;
		goto failed;
	}
	folio_wait_writeback(folio);

	/*
	 * Some architectures may have to restore extra metadata to the
	 * folio after reading from swap.
	 */
	arch_swap_restore(folio_swap(swap, folio), folio);

	/* May swap 'folio' for a copy in a zone that gfp permits. */
	if (shmem_should_replace_folio(folio, gfp)) {
		error = shmem_replace_folio(&folio, gfp, info, index, vma);
		if (error)
			goto failed;
	}

	/* Replace the swap entry in the mapping with the folio itself. */
	error = shmem_add_to_page_cache(folio, mapping, index,
					swp_to_radix_entry(swap), gfp);
	if (error)
		goto failed;

	shmem_recalc_inode(inode, 0, -nr_pages);

	if (sgp == SGP_WRITE)
		folio_mark_accessed(folio);

	/* Undo whichever swap-side claim was taken for this folio. */
	if (skip_swapcache) {
		folio->swap.val = 0;
		swapcache_clear(si, swap, nr_pages);
	} else {
		swap_cache_del_folio(folio);
	}
	folio_mark_dirty(folio);
	swap_free_nr(swap, nr_pages);
	put_swap_device(si);

	*foliop = folio;
	return 0;
failed:
	/* A changed mapping entry takes precedence over the original error. */
	if (shmem_confirm_swap(mapping, index, swap) < 0)
		error = -EEXIST;
	if (error == -EIO)
		shmem_set_folio_swapin_error(inode, index, folio, swap,
					     skip_swapcache);
unlock:
	if (folio)
		folio_unlock(folio);
failed_nolock:
	/*
	 * skip_swapcache is only ever set after a folio was obtained, so
	 * folio is non-NULL whenever this dereference is reached.
	 */
	if (skip_swapcache)
		swapcache_clear(si, folio->swap, folio_nr_pages(folio));
	if (folio)
		folio_put(folio);
	put_swap_device(si);

	return error;
}
2474
2475/*
2476 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
2477 *
2478 * If we allocate a new one we do not mark it dirty. That's up to the
2479 * vm. If we swap it in we mark it dirty since we also free the swap
2480 * entry since a page cannot live in both the swap and page cache.
2481 *
2482 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
2483 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
		loff_t write_end, struct folio **foliop, enum sgp_type sgp,
		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{
	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
	struct mm_struct *fault_mm;
	struct folio *folio;
	int error;
	bool alloced;
	unsigned long orders = 0;

	if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
		return -EINVAL;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
repeat:
	/* For sgp up to SGP_CACHE, lookups at or beyond i_size fail. */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
		return -EINVAL;

	alloced = false;
	fault_mm = vma ? vma->vm_mm : NULL;

	/* May return a folio, a swap value entry, or NULL for a hole. */
	folio = filemap_get_entry(inode->i_mapping, index);
	if (folio && vma && userfaultfd_minor(vma)) {
		/* Only a real folio holds a reference that must be dropped. */
		if (!xa_is_value(folio))
			folio_put(folio);
		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
		return 0;
	}

	if (xa_is_value(folio)) {
		error = shmem_swapin_folio(inode, index, &folio,
					   sgp, gfp, vma, fault_type);
		/* The entry changed under us: look it up again. */
		if (error == -EEXIST)
			goto repeat;

		*foliop = folio;
		return error;
	}

	if (folio) {
		folio_lock(folio);

		/* Has the folio been truncated or swapped out? */
		if (unlikely(folio->mapping != inode->i_mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		if (sgp == SGP_WRITE)
			folio_mark_accessed(folio);
		if (folio_test_uptodate(folio))
			goto out;
		/* fallocated folio */
		if (sgp != SGP_READ)
			goto clear;
		/* SGP_READ treats a not-uptodate folio like a hole. */
		folio_unlock(folio);
		folio_put(folio);
	}

	/*
	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
	 */
	*foliop = NULL;
	if (sgp == SGP_READ)
		return 0;
	if (sgp == SGP_NOALLOC)
		return -ENOENT;

	/*
	 * Fast cache lookup and swap lookup did not find it: allocate.
	 */

	if (vma && userfaultfd_missing(vma)) {
		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
		return 0;
	}

	/* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
	orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
	if (orders > 0) {
		gfp_t huge_gfp;

		huge_gfp = vma_thp_gfp_mask(vma);
		huge_gfp = limit_gfp_mask(huge_gfp, gfp);
		folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
				inode, index, fault_mm, orders);
		if (!IS_ERR(folio)) {
			if (folio_test_pmd_mappable(folio))
				count_vm_event(THP_FILE_ALLOC);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
			goto alloced;
		}
		if (PTR_ERR(folio) == -EEXIST)
			goto repeat;
		/* Any other failure falls through to an order-0 attempt. */
	}

	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
	if (IS_ERR(folio)) {
		error = PTR_ERR(folio);
		if (error == -EEXIST)
			goto repeat;
		/* alloced is still false, so unlock: only recalcs the inode. */
		folio = NULL;
		goto unlock;
	}

alloced:
	alloced = true;
	if (folio_test_large(folio) &&
	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
					folio_next_index(folio)) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		struct shmem_inode_info *info = SHMEM_I(inode);
		/*
		 * Part of the large folio is beyond i_size: subject
		 * to shrink under memory pressure.
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		/*
		 * _careful to defend against unlocked access to
		 * ->shrink_list in shmem_unused_huge_shrink()
		 */
		if (list_empty_careful(&info->shrinklist)) {
			list_add_tail(&info->shrinklist,
				      &sbinfo->shrinklist);
			sbinfo->shrinklist_len++;
		}
		spin_unlock(&sbinfo->shrinklist_lock);
	}

	if (sgp == SGP_WRITE)
		folio_set_referenced(folio);
	/*
	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
	 */
	if (sgp == SGP_FALLOC)
		sgp = SGP_WRITE;
clear:
	/*
	 * Let SGP_WRITE caller clear ends if write does not fill folio;
	 * but SGP_FALLOC on a folio fallocated earlier must initialize
	 * it now, lest undo on failure cancel our earlier guarantee.
	 */
	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
		long i, n = folio_nr_pages(folio);

		/* Zero every page of the folio before marking it uptodate. */
		for (i = 0; i < n; i++)
			clear_highpage(folio_page(folio, i));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		goto unlock;
	}
out:
	/* The folio is handed back still locked and referenced. */
	*foliop = folio;
	return 0;

	/*
	 * Error recovery.
	 */
unlock:
	/* Only a folio allocated in this call is removed from the cache. */
	if (alloced)
		filemap_remove_folio(folio);
	shmem_recalc_inode(inode, 0, 0);
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	return error;
}
2662
2663/**
2664 * shmem_get_folio - find, and lock a shmem folio.
2665 * @inode:	inode to search
2666 * @index:	the page index.
2667 * @write_end:	end of a write, could extend inode size
2668 * @foliop:	pointer to the folio if found
2669 * @sgp:	SGP_* flags to control behavior
2670 *
2671 * Looks up the page cache entry at @inode & @index.  If a folio is
2672 * present, it is returned locked with an increased refcount.
2673 *
2674 * If the caller modifies data in the folio, it must call folio_mark_dirty()
2675 * before unlocking the folio to ensure that the folio is not reclaimed.
2676 * There is no need to reserve space before calling folio_mark_dirty().
2677 *
2678 * When no folio is found, the behavior depends on @sgp:
2679 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
2680 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2681 *  - for all other flags a new folio is allocated, inserted into the
2682 *    page cache and returned locked in @foliop.
2683 *
2684 * Context: May sleep.
2685 * Return: 0 if successful, else a negative error code.
2686 */
2687int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2688		    struct folio **foliop, enum sgp_type sgp)
2689{
2690	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2691			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
2692}
2693EXPORT_SYMBOL_GPL(shmem_get_folio);
2694
2695/*
2696 * This is like autoremove_wake_function, but it removes the wait queue
2697 * entry unconditionally - even if something else had already woken the
2698 * target.
2699 */
2700static int synchronous_wake_function(wait_queue_entry_t *wait,
2701			unsigned int mode, int sync, void *key)
2702{
2703	int ret = default_wake_function(wait, mode, sync, key);
2704	list_del_init(&wait->entry);
2705	return ret;
2706}
2707
2708/*
2709 * Trinity finds that probing a hole which tmpfs is punching can
2710 * prevent the hole-punch from ever completing: which in turn
2711 * locks writers out with its hold on i_rwsem.  So refrain from
2712 * faulting pages into the hole while it's being punched.  Although
2713 * shmem_undo_range() does remove the additions, it may be unable to
2714 * keep up, as each new page needs its own unmap_mapping_range() call,
2715 * and the i_mmap tree grows ever slower to scan if new vmas are added.
2716 *
2717 * It does not matter if we sometimes reach this check just before the
2718 * hole-punch begins, so that one fault then races with the punch:
2719 * we just need to make racing faults a rare case.
2720 *
2721 * The implementation below would be much simpler if we just used a
2722 * standard mutex or completion: but we cannot take i_rwsem in fault,
2723 * and bloating every shmem inode for this unlikely case would be sad.
2724 */
/*
 * Returns 0 when the fault need not wait (no hole-punch recorded in
 * i_private with a waitqueue, or the faulting offset lies outside the
 * range being punched).  Otherwise sleeps on the hole-punch waitqueue
 * and returns VM_FAULT_NOPAGE, or VM_FAULT_RETRY when
 * maybe_unlock_mmap_for_io() returned a pinned file.
 */
static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{
	struct shmem_falloc *shmem_falloc;
	struct file *fpin = NULL;
	vm_fault_t ret = 0;

	/* i_private is sampled and tested under i_lock. */
	spin_lock(&inode->i_lock);
	shmem_falloc = inode->i_private;
	if (shmem_falloc &&
	    shmem_falloc->waitq &&
	    vmf->pgoff >= shmem_falloc->start &&
	    vmf->pgoff < shmem_falloc->next) {
		wait_queue_head_t *shmem_falloc_waitq;
		DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

		ret = VM_FAULT_NOPAGE;
		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		/* Keep our own copy of the waitqueue pointer for finish_wait(). */
		shmem_falloc_waitq = shmem_falloc->waitq;
		prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
				TASK_UNINTERRUPTIBLE);
		spin_unlock(&inode->i_lock);
		schedule();

		/*
		 * shmem_falloc_waitq points into the shmem_fallocate()
		 * stack of the hole-punching task: shmem_falloc_waitq
		 * is usually invalid by the time we reach here, but
		 * finish_wait() does not dereference it in that case;
		 * though i_lock needed lest racing with wake_up_all().
		 */
		spin_lock(&inode->i_lock);
		finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
	}
	spin_unlock(&inode->i_lock);
	/* A pinned file is released here and the fault is retried. */
	if (fpin) {
		fput(fpin);
		ret = VM_FAULT_RETRY;
	}
	return ret;
}
2765
2766static vm_fault_t shmem_fault(struct vm_fault *vmf)
2767{
2768	struct inode *inode = file_inode(vmf->vma->vm_file);
2769	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2770	struct folio *folio = NULL;
2771	vm_fault_t ret = 0;
2772	int err;
2773
2774	/*
2775	 * Trinity finds that probing a hole which tmpfs is punching can
2776	 * prevent the hole-punch from ever completing: noted in i_private.
2777	 */
2778	if (unlikely(inode->i_private)) {
2779		ret = shmem_falloc_wait(vmf, inode);
2780		if (ret)
2781			return ret;
2782	}
2783
2784	WARN_ON_ONCE(vmf->page != NULL);
2785	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
2786				  gfp, vmf, &ret);
2787	if (err)
2788		return vmf_error(err);
2789	if (folio) {
2790		vmf->page = folio_file_page(folio, vmf->pgoff);
2791		ret |= VM_FAULT_LOCKED;
2792	}
2793	return ret;
2794}
2795
/*
 * Choose an address for a shmem mapping, preferring one whose offset
 * within a huge page matches that of the file offset, so that the
 * mapping can be backed by huge pages.
 *
 * The address from mm_get_unmapped_area() is the default and is
 * returned unchanged by every bail-out below.  Only when all checks
 * pass is a larger area searched and an address inside it, with the
 * wanted alignment, returned instead.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long uaddr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	unsigned long addr;
	unsigned long offset;
	unsigned long inflated_len;
	unsigned long inflated_addr;
	unsigned long inflated_offset;
	unsigned long hpage_size;

	if (len > TASK_SIZE)
		return -ENOMEM;

	addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);

	/* Errors, unaligned or out-of-range results are passed through. */
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return addr;
	if (IS_ERR_VALUE(addr))
		return addr;
	if (addr & ~PAGE_MASK)
		return addr;
	if (addr > TASK_SIZE - len)
		return addr;

	if (shmem_huge == SHMEM_HUGE_DENY)
		return addr;
	if (flags & MAP_FIXED)
		return addr;
	/*
	 * Our priority is to support MAP_SHARED mapped hugely;
	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
	 * But if caller specified an address hint and we allocated area there
	 * successfully, respect that as before.
	 */
	if (uaddr == addr)
		return addr;

	/* Default to PMD-size alignment; may be lowered for anon shmem. */
	hpage_size = HPAGE_PMD_SIZE;
	if (shmem_huge != SHMEM_HUGE_FORCE) {
		struct super_block *sb;
		unsigned long __maybe_unused hpage_orders;
		int order = 0;

		if (file) {
			VM_BUG_ON(file->f_op != &shmem_file_operations);
			sb = file_inode(file)->i_sb;
		} else {
			/*
			 * Called directly from mm/mmap.c, or drivers/char/mem.c
			 * for "/dev/zero", to create a shared anonymous object.
			 */
			if (IS_ERR(shm_mnt))
				return addr;
			sb = shm_mnt->mnt_sb;

			/*
			 * Find the highest mTHP order used for anonymous shmem to
			 * provide a suitable alignment address.
			 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			hpage_orders = READ_ONCE(huge_shmem_orders_always);
			hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
			hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
			if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
				hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);

			if (hpage_orders > 0) {
				order = highest_order(hpage_orders);
				hpage_size = PAGE_SIZE << order;
			}
#endif
		}
		/* Huge pages disabled on this sb and no mTHP order found. */
		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
			return addr;
	}

	/* Too short to contain even one huge page. */
	if (len < hpage_size)
		return addr;

	/* Byte offset of the mapping's start within a huge page of the file. */
	offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
	/*
	 * With an unaligned start, a whole huge page only fits if the
	 * range extends at least to the second huge-page boundary.
	 */
	if (offset && offset + len < 2 * hpage_size)
		return addr;
	/* The default address already has the wanted alignment. */
	if ((addr & (hpage_size - 1)) == offset)
		return addr;

	/* Ask for enough slack to slide the start to the wanted alignment. */
	inflated_len = len + hpage_size - PAGE_SIZE;
	if (inflated_len > TASK_SIZE)
		return addr;
	/* The addition above wrapped around. */
	if (inflated_len < len)
		return addr;

	inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags);
	if (IS_ERR_VALUE(inflated_addr))
		return addr;
	if (inflated_addr & ~PAGE_MASK)
		return addr;

	/*
	 * Move to the first address at or above inflated_addr whose
	 * offset within a huge page equals 'offset'.
	 */
	inflated_offset = inflated_addr & (hpage_size - 1);
	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += hpage_size;

	if (inflated_addr > TASK_SIZE - len)
		return addr;
	return inflated_addr;
}
2903
2904#ifdef CONFIG_NUMA
2905static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2906{
2907	struct inode *inode = file_inode(vma->vm_file);
2908	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2909}
2910
2911static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2912					  unsigned long addr, pgoff_t *ilx)
2913{
2914	struct inode *inode = file_inode(vma->vm_file);
2915	pgoff_t index;
2916
2917	/*
2918	 * Bias interleave by inode number to distribute better across nodes;
2919	 * but this interface is independent of which page order is used, so
2920	 * supplies only that bias, letting caller apply the offset (adjusted
2921	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2922	 */
2923	*ilx = inode->i_ino;
2924	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2925	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2926}
2927
2928static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2929			pgoff_t index, unsigned int order, pgoff_t *ilx)
2930{
2931	struct mempolicy *mpol;
2932
2933	/* Bias interleave by inode number to distribute better across nodes */
2934	*ilx = info->vfs_inode.i_ino + (index >> order);
2935
2936	mpol = mpol_shared_policy_lookup(&info->policy, index);
2937	return mpol ? mpol : get_task_policy(current);
2938}
2939#else
/* !CONFIG_NUMA stub: reports interleave index 0 and no mempolicy. */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
			pgoff_t index, unsigned int order, pgoff_t *ilx)
{
	*ilx = 0;
	return NULL;
}
2946#endif /* CONFIG_NUMA */
2947
2948int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2949{
2950	struct inode *inode = file_inode(file);
2951	struct shmem_inode_info *info = SHMEM_I(inode);
2952	int retval = -ENOMEM;
2953
2954	/*
2955	 * What serializes the accesses to info->flags?
2956	 * ipc_lock_object() when called from shmctl_do_lock(),
2957	 * no serialization needed when called from shm_destroy().
2958	 */
2959	if (lock && !(info->flags & SHMEM_F_LOCKED)) {
2960		if (!user_shm_lock(inode->i_size, ucounts))
2961			goto out_nomem;
2962		info->flags |= SHMEM_F_LOCKED;
2963		mapping_set_unevictable(file->f_mapping);
2964	}
2965	if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
2966		user_shm_unlock(inode->i_size, ucounts);
2967		info->flags &= ~SHMEM_F_LOCKED;
2968		mapping_clear_unevictable(file->f_mapping);
2969	}
2970	retval = 0;
2971
2972out_nomem:
2973	return retval;
2974}
2975
2976static int shmem_mmap_prepare(struct vm_area_desc *desc)
2977{
2978	struct file *file = desc->file;
2979	struct inode *inode = file_inode(file);
2980
2981	file_accessed(file);
2982	/* This is anonymous shared memory if it is unlinked at the time of mmap */
2983	if (inode->i_nlink)
2984		desc->vm_ops = &shmem_vm_ops;
2985	else
2986		desc->vm_ops = &shmem_anon_vm_ops;
2987	return 0;
2988}
2989
/*
 * Open handler: sets FMODE_CAN_ODIRECT on the file, then defers to
 * generic_file_open() and returns its result.
 */
static int shmem_file_open(struct inode *inode, struct file *file)
{
	file->f_mode |= FMODE_CAN_ODIRECT;
	return generic_file_open(inode, file);
}
2995
2996#ifdef CONFIG_TMPFS_XATTR
2997static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2998
2999#if IS_ENABLED(CONFIG_UNICODE)
3000/*
3001 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
3002 *
 * The casefold file attribute needs some special checks. It can only be added
 * to an empty dir, and can't be removed from a non-empty dir.
3005 */
3006static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
3007				      struct dentry *dentry, unsigned int *i_flags)
3008{
3009	unsigned int old = inode->i_flags;
3010	struct super_block *sb = inode->i_sb;
3011
3012	if (fsflags & FS_CASEFOLD_FL) {
3013		if (!(old & S_CASEFOLD)) {
3014			if (!sb->s_encoding)
3015				return -EOPNOTSUPP;
3016
3017			if (!S_ISDIR(inode->i_mode))
3018				return -ENOTDIR;
3019
3020			if (dentry && !simple_empty(dentry))
3021				return -ENOTEMPTY;
3022		}
3023
3024		*i_flags = *i_flags | S_CASEFOLD;
3025	} else if (old & S_CASEFOLD) {
3026		if (dentry && !simple_empty(dentry))
3027			return -ENOTEMPTY;
3028	}
3029
3030	return 0;
3031}
3032#else
3033static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
3034				      struct dentry *dentry, unsigned int *i_flags)
3035{
3036	if (fsflags & FS_CASEFOLD_FL)
3037		return -EOPNOTSUPP;
3038
3039	return 0;
3040}
3041#endif
3042
3043/*
3044 * chattr's fsflags are unrelated to extended attributes,
3045 * but tmpfs has chosen to enable them under the same config option.
3046 */
3047static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3048{
3049	unsigned int i_flags = 0;
3050	int ret;
3051
3052	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
3053	if (ret)
3054		return ret;
3055
3056	if (fsflags & FS_NOATIME_FL)
3057		i_flags |= S_NOATIME;
3058	if (fsflags & FS_APPEND_FL)
3059		i_flags |= S_APPEND;
3060	if (fsflags & FS_IMMUTABLE_FL)
3061		i_flags |= S_IMMUTABLE;
3062	/*
3063	 * But FS_NODUMP_FL does not require any action in i_flags.
3064	 */
3065	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
3066
3067	return 0;
3068}
3069#else
/*
 * Variant built without CONFIG_TMPFS_XATTR: fsflags are not translated
 * into inode flags, so this does nothing.
 */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
/* No initxattrs callback in this configuration: callers pass NULL instead. */
#define shmem_initxattrs NULL
3074#endif
3075
3076static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
3077{
3078	return &SHMEM_I(inode)->dir_offsets;
3079}
3080
/*
 * Allocate and initialise a new shmem inode on @sb.
 *
 * @idmap, @dir and @mode are passed to inode_init_owner(); @dir may be NULL,
 * in which case no fsflags are inherited. @dev is only used for the special
 * inode case. Of @flags, only VM_NORESERVE is looked at here.
 *
 * Returns the new inode, the error from shmem_reserve_inode() wrapped in
 * ERR_PTR(), or ERR_PTR(-ENOSPC) when new_inode() fails (the inode
 * reservation is given back in that case).
 */
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
					     struct super_block *sb,
					     struct inode *dir, umode_t mode,
					     dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;
	int err;

	err = shmem_reserve_inode(sb, &ino);
	if (err)
		return ERR_PTR(err);

	inode = new_inode(sb);
	if (!inode) {
		shmem_free_inode(sb, 0);
		return ERR_PTR(-ENOSPC);
	}

	inode->i_ino = ino;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_blocks = 0;
	simple_inode_init_ts(inode);
	inode->i_generation = get_random_u32();
	info = SHMEM_I(inode);
	/*
	 * Zero the bytes from the start of info up to the inode itself.
	 * NOTE(review): presumably the VFS inode is embedded in
	 * shmem_inode_info after the shmem-private fields - confirm.
	 */
	memset(info, 0, (char *)inode - (char *)info);
	spin_lock_init(&info->lock);
	atomic_set(&info->stop_eviction, 0);
	info->seals = F_SEAL_SEAL;
	info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
	info->i_crtime = inode_get_mtime(inode);
	/* Inherit only the SHMEM_FL_INHERITED subset of the parent's fsflags */
	info->fsflags = (dir == NULL) ? 0 :
		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
	/*
	 * NOTE(review): the return value of shmem_set_inode_flags() is
	 * discarded here, so a failure leaves info->fsflags set without the
	 * corresponding i_flags - confirm this is intended.
	 */
	if (info->fsflags)
		shmem_set_inode_flags(inode, info->fsflags, NULL);
	INIT_LIST_HEAD(&info->shrinklist);
	INIT_LIST_HEAD(&info->swaplist);
	simple_xattrs_init(&info->xattrs);
	cache_no_acl(inode);
	if (sbinfo->noswap)
		mapping_set_unevictable(inode->i_mapping);

	/* Don't consider 'deny' for emergencies and 'force' for testing */
	if (sbinfo->huge)
		mapping_set_large_folios(inode->i_mapping);

	/* Hook up the operations that match the file type */
	switch (mode & S_IFMT) {
	default:
		inode->i_op = &shmem_special_inode_operations;
		init_special_inode(inode, mode, dev);
		break;
	case S_IFREG:
		inode->i_mapping->a_ops = &shmem_aops;
		inode->i_op = &shmem_inode_operations;
		inode->i_fop = &shmem_file_operations;
		mpol_shared_policy_init(&info->policy,
					 shmem_get_sbmpol(sbinfo));
		break;
	case S_IFDIR:
		inc_nlink(inode);
		/* Some things misbehave if size == 0 on a directory */
		inode->i_size = 2 * BOGO_DIRENT_SIZE;
		inode->i_op = &shmem_dir_inode_operations;
		inode->i_fop = &simple_offset_dir_operations;
		simple_offset_init(shmem_get_offset_ctx(inode));
		break;
	case S_IFLNK:
		/*
		 * Must not load anything in the rbtree,
		 * mpol_free_shared_policy will not be called.
		 */
		mpol_shared_policy_init(&info->policy, NULL);
		break;
	}

	lockdep_annotate_inode_mutex_key(inode);
	return inode;
}
3161
3162#ifdef CONFIG_TMPFS_QUOTA
3163static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3164				     struct super_block *sb, struct inode *dir,
3165				     umode_t mode, dev_t dev, unsigned long flags)
3166{
3167	int err;
3168	struct inode *inode;
3169
3170	inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3171	if (IS_ERR(inode))
3172		return inode;
3173
3174	err = dquot_initialize(inode);
3175	if (err)
3176		goto errout;
3177
3178	err = dquot_alloc_inode(inode);
3179	if (err) {
3180		dquot_drop(inode);
3181		goto errout;
3182	}
3183	return inode;
3184
3185errout:
3186	inode->i_flags |= S_NOQUOTA;
3187	iput(inode);
3188	return ERR_PTR(err);
3189}
3190#else
/*
 * Variant built without CONFIG_TMPFS_QUOTA: no quota bookkeeping, so this
 * simply forwards to __shmem_get_inode().
 */
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				     struct super_block *sb, struct inode *dir,
				     umode_t mode, dev_t dev, unsigned long flags)
{
	return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
3197#endif /* CONFIG_TMPFS_QUOTA */
3198
3199#ifdef CONFIG_USERFAULTFD
/*
 * Fill one page of a shmem mapping on behalf of userfaultfd.
 *
 * One block is accounted to the inode up front. If *@foliop is NULL a folio
 * is allocated and either filled from @src_addr (MFILL_ATOMIC_COPY) or
 * cleared; otherwise the folio found in *@foliop is consumed and *@foliop is
 * reset to NULL. The folio is then charged to @dst_vma's mm, added to the
 * page cache at the index that corresponds to @dst_addr, and mapped with
 * mfill_atomic_install_pte().
 *
 * Returns 0 on success. -ENOENT means the copy from user space could not
 * complete with page faults disabled: the folio is handed back through
 * *@foliop (and the block accounting is undone). -ENOMEM is returned when
 * the block cannot be accounted or the folio cannot be allocated, -EFAULT
 * when the index lies at or beyond i_size; errors from the charge, page
 * cache insertion and pte installation are passed through. All of those
 * failures release the folio and undo the block accounting.
 */
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
			   struct vm_area_struct *dst_vma,
			   unsigned long dst_addr,
			   unsigned long src_addr,
			   uffd_flags_t flags,
			   struct folio **foliop)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	void *page_kaddr;
	struct folio *folio;
	int ret;
	pgoff_t max_off;

	if (shmem_inode_acct_blocks(inode, 1)) {
		/*
		 * We may have got a page, returned -ENOENT triggering a retry,
		 * and now we find ourselves with -ENOMEM. Release the page, to
		 * avoid a BUG_ON in our caller.
		 */
		if (unlikely(*foliop)) {
			folio_put(*foliop);
			*foliop = NULL;
		}
		return -ENOMEM;
	}

	if (!*foliop) {
		ret = -ENOMEM;
		folio = shmem_alloc_folio(gfp, 0, info, pgoff);
		if (!folio)
			goto out_unacct_blocks;

		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
			page_kaddr = kmap_local_folio(folio, 0);
			/*
			 * The read mmap_lock is held here.  Despite the
			 * mmap_lock being read recursive a deadlock is still
			 * possible if a writer has taken a lock.  For example:
			 *
			 * process A thread 1 takes read lock on own mmap_lock
			 * process A thread 2 calls mmap, blocks taking write lock
			 * process B thread 1 takes page fault, read lock on own mmap lock
			 * process B thread 2 calls mmap, blocks taking write lock
			 * process A thread 1 blocks taking read lock on process B
			 * process B thread 1 blocks taking read lock on process A
			 *
			 * Disable page faults to prevent potential deadlock
			 * and retry the copy outside the mmap_lock.
			 */
			pagefault_disable();
			ret = copy_from_user(page_kaddr,
					     (const void __user *)src_addr,
					     PAGE_SIZE);
			pagefault_enable();
			kunmap_local(page_kaddr);

			/* fallback to copy_from_user outside mmap_lock */
			if (unlikely(ret)) {
				*foliop = folio;
				ret = -ENOENT;
				/* don't free the page */
				goto out_unacct_blocks;
			}

			flush_dcache_folio(folio);
		} else {		/* ZEROPAGE */
			clear_user_highpage(&folio->page, dst_addr);
		}
	} else {
		/* Retry: take over the folio prepared by the earlier attempt */
		folio = *foliop;
		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
		*foliop = NULL;
	}

	VM_BUG_ON(folio_test_locked(folio));
	VM_BUG_ON(folio_test_swapbacked(folio));
	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);
	__folio_mark_uptodate(folio);

	/* Refuse to install a page at or beyond the end of the file */
	ret = -EFAULT;
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(pgoff >= max_off))
		goto out_release;

	ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
	if (ret)
		goto out_release;
	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
	if (ret)
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_delete_from_cache;

	shmem_recalc_inode(inode, 1, 0);
	folio_unlock(folio);
	return 0;
	/* Error unwinding: each label undoes one more step, newest first */
out_delete_from_cache:
	filemap_remove_folio(folio);
out_release:
	folio_unlock(folio);
	folio_put(folio);
out_unacct_blocks:
	shmem_inode_unacct_blocks(inode, 1);
	return ret;
}
3313#endif /* CONFIG_USERFAULTFD */
3314
3315#ifdef CONFIG_TMPFS
3316static const struct inode_operations shmem_symlink_inode_operations;
3317static const struct inode_operations shmem_short_symlink_operations;
3318
3319static int
3320shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
3321		  loff_t pos, unsigned len,
3322		  struct folio **foliop, void **fsdata)
3323{
3324	struct inode *inode = mapping->host;
3325	struct shmem_inode_info *info = SHMEM_I(inode);
3326	pgoff_t index = pos >> PAGE_SHIFT;
3327	struct folio *folio;
3328	int ret = 0;
3329
3330	/* i_rwsem is held by caller */
3331	if (unlikely(info->seals & (F_SEAL_GROW |
3332				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
3333		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
3334			return -EPERM;
3335		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3336			return -EPERM;
3337	}
3338
3339	if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
3340		     pos + len > inode->i_size))
3341		return -EPERM;
3342
3343	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3344	if (ret)
3345		return ret;
3346
3347	if (folio_contain_hwpoisoned_page(folio)) {
3348		folio_unlock(folio);
3349		folio_put(folio);
3350		return -EIO;
3351	}
3352
3353	*foliop = folio;
3354	return 0;
3355}
3356
3357static int
3358shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
3359		loff_t pos, unsigned len, unsigned copied,
3360		struct folio *folio, void *fsdata)
3361{
3362	struct inode *inode = mapping->host;
3363
3364	if (pos + copied > inode->i_size)
3365		i_size_write(inode, pos + copied);
3366
3367	if (!folio_test_uptodate(folio)) {
3368		if (copied < folio_size(folio)) {
3369			size_t from = offset_in_folio(folio, pos);
3370			folio_zero_segments(folio, 0, from,
3371					from + copied, folio_size(folio));
3372		}
3373		folio_mark_uptodate(folio);
3374	}
3375	folio_mark_dirty(folio);
3376	folio_unlock(folio);
3377	folio_put(folio);
3378
3379	return copied;
3380}
3381
/*
 * read_iter for shmem files: copy data starting at iocb->ki_pos into @to,
 * one folio at a time (or one page at a time when a large folio has
 * hwpoisoned pages), advancing iocb->ki_pos as it goes.
 *
 * Indexes for which shmem_get_folio(SGP_READ) succeeds without handing back
 * a folio are satisfied with zeroes. Reading a hwpoisoned page fails with
 * -EIO and a short copy with -EFAULT, but any bytes already copied take
 * precedence: the return value is the byte count if non-zero, else the
 * error (0 when nothing was read at or beyond EOF).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	int error = 0;
	ssize_t retval = 0;

	for (;;) {
		struct folio *folio = NULL;
		struct page *page = NULL;
		unsigned long nr, ret;
		loff_t end_offset, i_size = i_size_read(inode);
		bool fallback_page_copy = false;
		size_t fsize;

		if (unlikely(iocb->ki_pos >= i_size))
			break;

		index = iocb->ki_pos >> PAGE_SHIFT;
		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
		if (error) {
			/* -EINVAL is not reported to the caller */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (folio) {
			folio_unlock(folio);

			page = folio_file_page(folio, index);
			if (PageHWPoison(page)) {
				folio_put(folio);
				error = -EIO;
				break;
			}

			/* Copy page by page when the large folio has poisoned pages */
			if (folio_test_large(folio) &&
			    folio_test_has_hwpoisoned(folio))
				fallback_page_copy = true;
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_rwsem protection against truncate
		 */
		i_size = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= i_size)) {
			if (folio)
				folio_put(folio);
			break;
		}
		end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
		/* Copy granularity: the whole folio, or a single page */
		if (folio && likely(!fallback_page_copy))
			fsize = folio_size(folio);
		else
			fsize = PAGE_SIZE;
		offset = iocb->ki_pos & (fsize - 1);
		nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);

		if (folio) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping)) {
				if (likely(!fallback_page_copy))
					flush_dcache_folio(folio);
				else
					flush_dcache_page(page);
			}

			/*
			 * Mark the folio accessed if we read the beginning.
			 */
			if (!offset)
				folio_mark_accessed(folio);
			/*
			 * Ok, we have the page, and it's up-to-date, so
			 * now we can copy it to user space...
			 */
			if (likely(!fallback_page_copy))
				ret = copy_folio_to_iter(folio, offset, nr, to);
			else
				ret = copy_page_to_iter(page, offset, nr, to);
			folio_put(folio);
		} else if (user_backed_iter(to)) {
			/*
			 * Copy to user tends to be so well optimized, but
			 * clear_user() not so much, that it is noticeably
			 * faster to copy the zero page instead of clearing.
			 */
			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
		} else {
			/*
			 * But submitting the same page twice in a row to
			 * splice() - or others? - can result in confusion:
			 * so don't attempt that optimization on pipes etc.
			 */
			ret = iov_iter_zero(nr, to);
		}

		retval += ret;
		iocb->ki_pos += ret;

		if (!iov_iter_count(to))
			break;
		/* A short copy with room left in the iterator is a fault */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	file_accessed(file);
	return retval ? retval : error;
}
3501
3502static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3503{
3504	struct file *file = iocb->ki_filp;
3505	struct inode *inode = file->f_mapping->host;
3506	ssize_t ret;
3507
3508	inode_lock(inode);
3509	ret = generic_write_checks(iocb, from);
3510	if (ret <= 0)
3511		goto unlock;
3512	ret = file_remove_privs(file);
3513	if (ret)
3514		goto unlock;
3515	ret = file_update_time(file);
3516	if (ret)
3517		goto unlock;
3518	ret = generic_perform_write(iocb, from);
3519unlock:
3520	inode_unlock(inode);
3521	return ret;
3522}
3523
/*
 * Pipe buffer operations for the buffers that splice_zeropage_into_pipe()
 * fills with ZERO_PAGE(0). None of the callbacks touch the page.
 */

/* ->get: always reports success without doing anything. */
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	return true;
}

/* ->release: nothing to do. */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
}

/* ->try_steal: always refuses. */
static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return false;
}

static const struct pipe_buf_operations zero_pipe_buf_ops = {
	.release	= zero_pipe_buf_release,
	.try_steal	= zero_pipe_buf_try_steal,
	.get		= zero_pipe_buf_get,
};
3546
3547static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3548					loff_t fpos, size_t size)
3549{
3550	size_t offset = fpos & ~PAGE_MASK;
3551
3552	size = min_t(size_t, size, PAGE_SIZE - offset);
3553
3554	if (!pipe_is_full(pipe)) {
3555		struct pipe_buffer *buf = pipe_head_buf(pipe);
3556
3557		*buf = (struct pipe_buffer) {
3558			.ops	= &zero_pipe_buf_ops,
3559			.page	= ZERO_PAGE(0),
3560			.offset	= offset,
3561			.len	= size,
3562		};
3563		pipe->head++;
3564	}
3565
3566	return size;
3567}
3568
/*
 * splice_read for shmem files: splice up to @len bytes starting at *@ppos
 * from @in into @pipe, advancing *@ppos.
 *
 * Indexes for which shmem_get_folio(SGP_READ) succeeds without handing back
 * a folio are spliced as references to the zero page. A hwpoisoned page
 * stops the splice with -EIO. Returns the number of bytes spliced if any,
 * else the error (0 when there was nothing to splice).
 */
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio = NULL;
	size_t total_spliced = 0, used, npages, n, part;
	loff_t isize;
	int error = 0;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_buf_usage(pipe);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);

	do {
		bool fallback_page_splice = false;
		struct page *page = NULL;
		pgoff_t index;
		size_t size;

		if (*ppos >= i_size_read(inode))
			break;

		index = *ppos >> PAGE_SHIFT;
		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
		if (error) {
			/* -EINVAL is not reported to the caller */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (folio) {
			folio_unlock(folio);

			page = folio_file_page(folio, index);
			if (PageHWPoison(page)) {
				/* The folio reference is dropped after the loop */
				error = -EIO;
				break;
			}

			if (folio_test_large(folio) &&
			    folio_test_has_hwpoisoned(folio))
				fallback_page_splice = true;
		}

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(*ppos >= isize))
			break;
		/*
		 * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
		 * pages.
		 */
		size = len;
		if (unlikely(fallback_page_splice)) {
			size_t offset = *ppos & ~PAGE_MASK;

			size = umin(size, PAGE_SIZE - offset);
		}
		part = min_t(loff_t, isize - *ppos, size);

		if (folio) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping)) {
				if (likely(!fallback_page_splice))
					flush_dcache_folio(folio);
				else
					flush_dcache_page(page);
			}
			folio_mark_accessed(folio);
			/*
			 * Ok, we have the page, and it's up-to-date, so we can
			 * now splice it into the pipe.
			 */
			n = splice_folio_into_pipe(pipe, folio, *ppos, part);
			folio_put(folio);
			folio = NULL;
		} else {
			n = splice_zeropage_into_pipe(pipe, *ppos, part);
		}

		if (!n)
			break;
		len -= n;
		total_spliced += n;
		*ppos += n;
		in->f_ra.prev_pos = *ppos;
		if (pipe_is_full(pipe))
			break;

		cond_resched();
	} while (len);

	/* Drop the reference still held when the loop was left early */
	if (folio)
		folio_put(folio);

	file_accessed(in);
	return total_spliced ? total_spliced : error;
}
3680
3681static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
3682{
3683	struct address_space *mapping = file->f_mapping;
3684	struct inode *inode = mapping->host;
3685
3686	if (whence != SEEK_DATA && whence != SEEK_HOLE)
3687		return generic_file_llseek_size(file, offset, whence,
3688					MAX_LFS_FILESIZE, i_size_read(inode));
3689	if (offset < 0)
3690		return -ENXIO;
3691
3692	inode_lock(inode);
3693	/* We're holding i_rwsem so we can access i_size directly */
3694	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
3695	if (offset >= 0)
3696		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3697	inode_unlock(inode);
3698	return offset;
3699}
3700
/*
 * fallocate for shmem files. Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are accepted; any other mode bit gives -EOPNOTSUPP.
 *
 * The work is done under the inode lock. A mapping flagged
 * SHMEM_F_MAPPING_FROZEN refuses the call with -EPERM. Hole punching is
 * refused with -EPERM under F_SEAL_WRITE or F_SEAL_FUTURE_WRITE, and
 * preallocation past i_size is refused with -EPERM under F_SEAL_GROW.
 *
 * While the operation is in progress inode->i_private points at an on-stack
 * struct shmem_falloc describing the range being worked on.
 *
 * Returns 0 on success or a negative errno. When preallocation fails part
 * way through, the !uptodate folios added so far are removed again.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end, undo_fallocend;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (info->flags & SHMEM_F_MAPPING_FROZEN) {
		error = -EPERM;
		goto out;
	}

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Only whole pages inside the range are unmapped */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_rwsem */
		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
			error = -EPERM;
			goto out;
		}

		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/* Unpublish the on-stack state and wake anyone waiting on it */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	/*
	 * info->fallocend is only relevant when huge pages might be
	 * involved: to prevent split_huge_page() freeing fallocated
	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
	 */
	undo_fallocend = info->fallocend;
	if (info->fallocend < end)
		info->fallocend = end;

	for (index = start; index < end; ) {
		struct folio *folio;

		/*
		 * Check for fatal signal so that we abort early in OOM
		 * situations. We don't want to abort in case of non-fatal
		 * signals as large fallocate can take noticeable time and
		 * e.g. periodic timers may result in fallocate constantly
		 * restarting.
		 */
		if (fatal_signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_get_folio(inode, index, offset + len,
						&folio, SGP_FALLOC);
		if (error) {
			info->fallocend = undo_fallocend;
			/* Remove the !uptodate folios we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		/*
		 * Here is a more important optimization than it appears:
		 * a second SGP_FALLOC on the same large folio will clear it,
		 * making it uptodate and un-undoable if we fail later.
		 */
		index = folio_next_index(folio);
		/* Beware 32-bit wraparound */
		if (!index)
			index--;

		/*
		 * Inform shmem_writeout() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		if (!folio_test_uptodate(folio))
			shmem_falloc.nr_falloced += index - shmem_falloc.next;
		shmem_falloc.next = index;

		/*
		 * If !uptodate, leave it that way so that freeable folios
		 * can be recognized if we need to rollback on error later.
		 * But mark it dirty so that memory pressure will swap rather
		 * than free the folios we are allocating (and SGP_CACHE folios
		 * might still be clean: we now need to mark those dirty too).
		 */
		folio_mark_dirty(folio);
		folio_unlock(folio);
		folio_put(folio);
		cond_resched();
	}

	/* Grow the file unless the caller asked to keep its size */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
undone:
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	if (!error)
		file_modified(file);
	inode_unlock(inode);
	return error;
}
3862
3863static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3864{
3865	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3866
3867	buf->f_type = TMPFS_MAGIC;
3868	buf->f_bsize = PAGE_SIZE;
3869	buf->f_namelen = NAME_MAX;
3870	if (sbinfo->max_blocks) {
3871		buf->f_blocks = sbinfo->max_blocks;
3872		buf->f_bavail =
3873		buf->f_bfree  = sbinfo->max_blocks -
3874				percpu_counter_sum(&sbinfo->used_blocks);
3875	}
3876	if (sbinfo->max_inodes) {
3877		buf->f_files = sbinfo->max_inodes;
3878		buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3879	}
3880	/* else leave those fields 0 like simple_statfs */
3881
3882	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3883
3884	return 0;
3885}
3886
3887/*
3888 * File creation. Allocate an inode, and we're done..
3889 */
3890static int
3891shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3892	    struct dentry *dentry, umode_t mode, dev_t dev)
3893{
3894	struct inode *inode;
3895	int error;
3896
3897	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
3898		return -EINVAL;
3899
3900	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3901	if (IS_ERR(inode))
3902		return PTR_ERR(inode);
3903
3904	error = simple_acl_create(dir, inode);
3905	if (error)
3906		goto out_iput;
3907	error = security_inode_init_security(inode, dir, &dentry->d_name,
3908					     shmem_initxattrs, NULL);
3909	if (error && error != -EOPNOTSUPP)
3910		goto out_iput;
3911
3912	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3913	if (error)
3914		goto out_iput;
3915
3916	dir->i_size += BOGO_DIRENT_SIZE;
3917	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3918	inode_inc_iversion(dir);
3919
3920	d_make_persistent(dentry, inode);
3921	return error;
3922
3923out_iput:
3924	iput(inode);
3925	return error;
3926}
3927
3928static int
3929shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3930	      struct file *file, umode_t mode)
3931{
3932	struct inode *inode;
3933	int error;
3934
3935	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3936	if (IS_ERR(inode)) {
3937		error = PTR_ERR(inode);
3938		goto err_out;
3939	}
3940	error = security_inode_init_security(inode, dir, NULL,
3941					     shmem_initxattrs, NULL);
3942	if (error && error != -EOPNOTSUPP)
3943		goto out_iput;
3944	error = simple_acl_create(dir, inode);
3945	if (error)
3946		goto out_iput;
3947	d_tmpfile(file, inode);
3948
3949err_out:
3950	return finish_open_simple(file, error);
3951out_iput:
3952	iput(inode);
3953	return error;
3954}
3955
3956static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3957				  struct dentry *dentry, umode_t mode)
3958{
3959	int error;
3960
3961	error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3962	if (error)
3963		return ERR_PTR(error);
3964	inc_nlink(dir);
3965	return NULL;
3966}
3967
3968static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3969			struct dentry *dentry, umode_t mode, bool excl)
3970{
3971	return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3972}
3973
3974/*
3975 * Link a file..
3976 */
3977static int shmem_link(struct dentry *old_dentry, struct inode *dir,
3978		      struct dentry *dentry)
3979{
3980	struct inode *inode = d_inode(old_dentry);
3981	int ret;
3982
3983	/*
3984	 * No ordinary (disk based) filesystem counts links as inodes;
3985	 * but each new link needs a new dentry, pinning lowmem, and
3986	 * tmpfs dentries cannot be pruned until they are unlinked.
3987	 * But if an O_TMPFILE file is linked into the tmpfs, the
3988	 * first link must skip that, to get the accounting right.
3989	 */
3990	if (inode->i_nlink) {
3991		ret = shmem_reserve_inode(inode->i_sb, NULL);
3992		if (ret)
3993			return ret;
3994	}
3995
3996	ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3997	if (ret) {
3998		if (inode->i_nlink)
3999			shmem_free_inode(inode->i_sb, 0);
4000		return ret;
4001	}
4002
4003	dir->i_size += BOGO_DIRENT_SIZE;
4004	inode_inc_iversion(dir);
4005	return simple_link(old_dentry, dir, dentry);
4006}
4007
4008static int shmem_unlink(struct inode *dir, struct dentry *dentry)
4009{
4010	struct inode *inode = d_inode(dentry);
4011
4012	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
4013		shmem_free_inode(inode->i_sb, 0);
4014
4015	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4016
4017	dir->i_size -= BOGO_DIRENT_SIZE;
4018	inode_inc_iversion(dir);
4019	simple_unlink(dir, dentry);
4020
4021	/*
4022	 * For now, VFS can't deal with case-insensitive negative dentries, so
4023	 * we invalidate them
4024	 */
4025	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4026		d_invalidate(dentry);
4027
4028	return 0;
4029}
4030
4031static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
4032{
4033	if (!simple_empty(dentry))
4034		return -ENOTEMPTY;
4035
4036	drop_nlink(d_inode(dentry));
4037	drop_nlink(dir);
4038	return shmem_unlink(dir, dentry);
4039}
4040
4041static int shmem_whiteout(struct mnt_idmap *idmap,
4042			  struct inode *old_dir, struct dentry *old_dentry)
4043{
4044	struct dentry *whiteout;
4045	int error;
4046
4047	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
4048	if (!whiteout)
4049		return -ENOMEM;
4050	error = shmem_mknod(idmap, old_dir, whiteout,
4051			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4052	dput(whiteout);
4053	return error;
4054}
4055
/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly free's it when it
 * gets overwritten.
 *
 * Accepted @flags are RENAME_NOREPLACE, RENAME_EXCHANGE and
 * RENAME_WHITEOUT; anything else gives -EINVAL. RENAME_EXCHANGE is handed
 * to simple_offset_rename_exchange() in full. Otherwise returns -ENOTEMPTY
 * when simple_empty() rejects the target, the error from
 * simple_offset_add() or shmem_whiteout(), or 0 on success.
 */
static int shmem_rename2(struct mnt_idmap *idmap,
			 struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	struct inode *inode = d_inode(old_dentry);
	int they_are_dirs = S_ISDIR(inode->i_mode);
	bool had_offset = false;
	int error;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return simple_offset_rename_exchange(old_dir, old_dentry,
						     new_dir, new_dentry);

	if (!simple_empty(new_dentry))
		return -ENOTEMPTY;

	/*
	 * -EBUSY is not treated as a failure: it is remembered in had_offset
	 * so that the whiteout error path below does not remove the offset.
	 */
	error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry);
	if (error == -EBUSY)
		had_offset = true;
	else if (unlikely(error))
		return error;

	if (flags & RENAME_WHITEOUT) {
		error = shmem_whiteout(idmap, old_dir, old_dentry);
		if (error) {
			/* Only undo the offset that was added just above */
			if (!had_offset)
				simple_offset_remove(shmem_get_offset_ctx(new_dir),
						     new_dentry);
			return error;
		}
	}

	simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (d_really_is_positive(new_dentry)) {
		/* Existing target: unlink it, then fix up the directory links */
		(void) shmem_unlink(new_dir, new_dentry);
		if (they_are_dirs) {
			drop_nlink(d_inode(new_dentry));
			drop_nlink(old_dir);
		}
	} else if (they_are_dirs) {
		/* A directory moves its link from the old parent to the new */
		drop_nlink(old_dir);
		inc_nlink(new_dir);
	}

	old_dir->i_size -= BOGO_DIRENT_SIZE;
	new_dir->i_size += BOGO_DIRENT_SIZE;
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	return 0;
}
4117
4118static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
4119			 struct dentry *dentry, const char *symname)
4120{
4121	int error;
4122	int len;
4123	struct inode *inode;
4124	struct folio *folio;
4125	char *link;
4126
4127	len = strlen(symname) + 1;
4128	if (len > PAGE_SIZE)
4129		return -ENAMETOOLONG;
4130
4131	inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
4132				VM_NORESERVE);
4133	if (IS_ERR(inode))
4134		return PTR_ERR(inode);
4135
4136	error = security_inode_init_security(inode, dir, &dentry->d_name,
4137					     shmem_initxattrs, NULL);
4138	if (error && error != -EOPNOTSUPP)
4139		goto out_iput;
4140
4141	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
4142	if (error)
4143		goto out_iput;
4144
4145	inode->i_size = len-1;
4146	if (len <= SHORT_SYMLINK_LEN) {
4147		link = kmemdup(symname, len, GFP_KERNEL);
4148		if (!link) {
4149			error = -ENOMEM;
4150			goto out_remove_offset;
4151		}
4152		inode->i_op = &shmem_short_symlink_operations;
4153		inode_set_cached_link(inode, link, len - 1);
4154	} else {
4155		inode_nohighmem(inode);
4156		inode->i_mapping->a_ops = &shmem_aops;
4157		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
4158		if (error)
4159			goto out_remove_offset;
4160		inode->i_op = &shmem_symlink_inode_operations;
4161		memcpy(folio_address(folio), symname, len);
4162		folio_mark_uptodate(folio);
4163		folio_mark_dirty(folio);
4164		folio_unlock(folio);
4165		folio_put(folio);
4166	}
4167	dir->i_size += BOGO_DIRENT_SIZE;
4168	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
4169	inode_inc_iversion(dir);
4170	d_make_persistent(dentry, inode);
4171	return 0;
4172
4173out_remove_offset:
4174	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4175out_iput:
4176	iput(inode);
4177	return error;
4178}
4179
/*
 * Delayed-call callback registered by shmem_get_link(): @arg is the folio
 * whose reference is released here after marking it accessed.
 */
static void shmem_put_link(void *arg)
{
	struct folio *folio = arg;

	folio_mark_accessed(folio);
	folio_put(folio);
}
4185
4186static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
4187				  struct delayed_call *done)
4188{
4189	struct folio *folio = NULL;
4190	int error;
4191
4192	if (!dentry) {
4193		folio = filemap_get_folio(inode->i_mapping, 0);
4194		if (IS_ERR(folio))
4195			return ERR_PTR(-ECHILD);
4196		if (PageHWPoison(folio_page(folio, 0)) ||
4197		    !folio_test_uptodate(folio)) {
4198			folio_put(folio);
4199			return ERR_PTR(-ECHILD);
4200		}
4201	} else {
4202		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
4203		if (error)
4204			return ERR_PTR(error);
4205		if (!folio)
4206			return ERR_PTR(-ECHILD);
4207		if (PageHWPoison(folio_page(folio, 0))) {
4208			folio_unlock(folio);
4209			folio_put(folio);
4210			return ERR_PTR(-ECHILD);
4211		}
4212		folio_unlock(folio);
4213	}
4214	set_delayed_call(done, shmem_put_link, folio);
4215	return folio_address(folio);
4216}
4217
4218#ifdef CONFIG_TMPFS_XATTR
4219
4220static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
4221{
4222	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4223
4224	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
4225
4226	return 0;
4227}
4228
4229static int shmem_fileattr_set(struct mnt_idmap *idmap,
4230			      struct dentry *dentry, struct file_kattr *fa)
4231{
4232	struct inode *inode = d_inode(dentry);
4233	struct shmem_inode_info *info = SHMEM_I(inode);
4234	int ret, flags;
4235
4236	if (fileattr_has_fsx(fa))
4237		return -EOPNOTSUPP;
4238	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
4239		return -EOPNOTSUPP;
4240
4241	flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
4242		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
4243
4244	ret = shmem_set_inode_flags(inode, flags, dentry);
4245
4246	if (ret)
4247		return ret;
4248
4249	info->fsflags = flags;
4250
4251	inode_set_ctime_current(inode);
4252	inode_inc_iversion(inode);
4253	return 0;
4254}
4255
4256/*
4257 * Superblocks without xattr inode operations may get some security.* xattr
4258 * support from the LSM "for free". As soon as we have any other xattrs
4259 * like ACLs, we also need to implement the security.* handlers at
4260 * filesystem level, though.
4261 */
4262
4263/*
4264 * Callback for security_inode_init_security() for acquiring xattrs.
4265 */
4266static int shmem_initxattrs(struct inode *inode,
4267			    const struct xattr *xattr_array, void *fs_info)
4268{
4269	struct shmem_inode_info *info = SHMEM_I(inode);
4270	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4271	const struct xattr *xattr;
4272	struct simple_xattr *new_xattr;
4273	size_t ispace = 0;
4274	size_t len;
4275
4276	if (sbinfo->max_inodes) {
4277		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4278			ispace += simple_xattr_space(xattr->name,
4279				xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
4280		}
4281		if (ispace) {
4282			raw_spin_lock(&sbinfo->stat_lock);
4283			if (sbinfo->free_ispace < ispace)
4284				ispace = 0;
4285			else
4286				sbinfo->free_ispace -= ispace;
4287			raw_spin_unlock(&sbinfo->stat_lock);
4288			if (!ispace)
4289				return -ENOSPC;
4290		}
4291	}
4292
4293	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4294		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
4295		if (!new_xattr)
4296			break;
4297
4298		len = strlen(xattr->name) + 1;
4299		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
4300					  GFP_KERNEL_ACCOUNT);
4301		if (!new_xattr->name) {
4302			kvfree(new_xattr);
4303			break;
4304		}
4305
4306		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4307		       XATTR_SECURITY_PREFIX_LEN);
4308		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4309		       xattr->name, len);
4310
4311		simple_xattr_add(&info->xattrs, new_xattr);
4312	}
4313
4314	if (xattr->name != NULL) {
4315		if (ispace) {
4316			raw_spin_lock(&sbinfo->stat_lock);
4317			sbinfo->free_ispace += ispace;
4318			raw_spin_unlock(&sbinfo->stat_lock);
4319		}
4320		simple_xattrs_free(&info->xattrs, NULL);
4321		return -ENOMEM;
4322	}
4323
4324	return 0;
4325}
4326
4327static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4328				   struct dentry *unused, struct inode *inode,
4329				   const char *name, void *buffer, size_t size)
4330{
4331	struct shmem_inode_info *info = SHMEM_I(inode);
4332
4333	name = xattr_full_name(handler, name);
4334	return simple_xattr_get(&info->xattrs, name, buffer, size);
4335}
4336
4337static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4338				   struct mnt_idmap *idmap,
4339				   struct dentry *unused, struct inode *inode,
4340				   const char *name, const void *value,
4341				   size_t size, int flags)
4342{
4343	struct shmem_inode_info *info = SHMEM_I(inode);
4344	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4345	struct simple_xattr *old_xattr;
4346	size_t ispace = 0;
4347
4348	name = xattr_full_name(handler, name);
4349	if (value && sbinfo->max_inodes) {
4350		ispace = simple_xattr_space(name, size);
4351		raw_spin_lock(&sbinfo->stat_lock);
4352		if (sbinfo->free_ispace < ispace)
4353			ispace = 0;
4354		else
4355			sbinfo->free_ispace -= ispace;
4356		raw_spin_unlock(&sbinfo->stat_lock);
4357		if (!ispace)
4358			return -ENOSPC;
4359	}
4360
4361	old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
4362	if (!IS_ERR(old_xattr)) {
4363		ispace = 0;
4364		if (old_xattr && sbinfo->max_inodes)
4365			ispace = simple_xattr_space(old_xattr->name,
4366						    old_xattr->size);
4367		simple_xattr_free(old_xattr);
4368		old_xattr = NULL;
4369		inode_set_ctime_current(inode);
4370		inode_inc_iversion(inode);
4371	}
4372	if (ispace) {
4373		raw_spin_lock(&sbinfo->stat_lock);
4374		sbinfo->free_ispace += ispace;
4375		raw_spin_unlock(&sbinfo->stat_lock);
4376	}
4377	return PTR_ERR(old_xattr);
4378}
4379
/*
 * Handlers for the "security.", "trusted." and "user." xattr namespaces.
 * All three share the same get/set callbacks and differ only in the
 * prefix they match.
 */
static const struct xattr_handler shmem_security_xattr_handler = {
	.prefix = XATTR_SECURITY_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};

static const struct xattr_handler shmem_trusted_xattr_handler = {
	.prefix = XATTR_TRUSTED_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};

static const struct xattr_handler shmem_user_xattr_handler = {
	.prefix = XATTR_USER_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};

/* NULL-terminated handler table; shmem_fill_super() installs it as sb->s_xattr. */
static const struct xattr_handler * const shmem_xattr_handlers[] = {
	&shmem_security_xattr_handler,
	&shmem_trusted_xattr_handler,
	&shmem_user_xattr_handler,
	NULL
};
4404
4405static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
4406{
4407	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4408	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
4409}
4410#endif /* CONFIG_TMPFS_XATTR */
4411
/*
 * Inode operations for short symlinks: shmem_symlink() installs these when
 * the target (NUL included) fits in SHORT_SYMLINK_LEN bytes and has been
 * cached on the inode, so ->get_link is simple_get_link().
 */
static const struct inode_operations shmem_short_symlink_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
	.get_link	= simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
4420
/*
 * Inode operations for symlinks whose target is stored in folio 0 of the
 * inode's mapping (the long-target path of shmem_symlink()); the target is
 * read back through shmem_get_link().
 */
static const struct inode_operations shmem_symlink_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
	.get_link	= shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
4429
/*
 * export_operations ->get_parent: parent lookup is not implemented, so
 * every request fails with -ESTALE.
 */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	return ERR_PTR(-ESTALE);
}
4434
4435static int shmem_match(struct inode *ino, void *vfh)
4436{
4437	__u32 *fh = vfh;
4438	__u64 inum = fh[2];
4439	inum = (inum << 32) | fh[1];
4440	return ino->i_ino == inum && fh[0] == ino->i_generation;
4441}
4442
/*
 * Return a dentry alias of @inode: the one d_find_alias() picks if there
 * is any, otherwise whatever d_find_any_alias() returns (possibly NULL).
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);

	if (!alias)
		alias = d_find_any_alias(inode);
	return alias;
}
4450
4451static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
4452		struct fid *fid, int fh_len, int fh_type)
4453{
4454	struct inode *inode;
4455	struct dentry *dentry = NULL;
4456	u64 inum;
4457
4458	if (fh_len < 3)
4459		return NULL;
4460
4461	inum = fid->raw[2];
4462	inum = (inum << 32) | fid->raw[1];
4463
4464	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
4465			shmem_match, fid->raw);
4466	if (inode) {
4467		dentry = shmem_find_alias(inode);
4468		iput(inode);
4469	}
4470
4471	return dentry;
4472}
4473
4474static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
4475				struct inode *parent)
4476{
4477	if (*len < 3) {
4478		*len = 3;
4479		return FILEID_INVALID;
4480	}
4481
4482	if (inode_unhashed(inode)) {
4483		/* Unfortunately insert_inode_hash is not idempotent,
4484		 * so as we hash inodes here rather than at creation
4485		 * time, we need a lock to ensure we only try
4486		 * to do it once
4487		 */
4488		static DEFINE_SPINLOCK(lock);
4489		spin_lock(&lock);
4490		if (inode_unhashed(inode))
4491			__insert_inode_hash(inode,
4492					    inode->i_ino + inode->i_generation);
4493		spin_unlock(&lock);
4494	}
4495
4496	fh[0] = inode->i_generation;
4497	fh[1] = inode->i_ino;
4498	fh[2] = ((__u64)inode->i_ino) >> 32;
4499
4500	*len = 3;
4501	return 1;
4502}
4503
/*
 * File-handle export operations; shmem_fill_super() installs them as
 * sb->s_export_op in CONFIG_TMPFS builds.
 */
static const struct export_operations shmem_export_ops = {
	.get_parent     = shmem_get_parent,
	.encode_fh      = shmem_encode_fh,
	.fh_to_dentry	= shmem_fh_to_dentry,
};
4509
/*
 * Mount option identifiers: the values fs_parse() yields for entries of
 * shmem_fs_parameters and that shmem_parse_one() switches on.
 */
enum shmem_param {
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	Opt_inode32,
	Opt_inode64,
	Opt_noswap,
	Opt_quota,
	Opt_usrquota,
	Opt_grpquota,
	Opt_usrquota_block_hardlimit,
	Opt_usrquota_inode_hardlimit,
	Opt_grpquota_block_hardlimit,
	Opt_grpquota_inode_hardlimit,
	Opt_casefold_version,
	Opt_casefold,
	Opt_strict_encoding,
};
4533
/* Accepted values of the "huge=" mount option and their SHMEM_HUGE_* codes. */
static const struct constant_table shmem_param_enums_huge[] = {
	{"never",	SHMEM_HUGE_NEVER },
	{"always",	SHMEM_HUGE_ALWAYS },
	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
	{"advise",	SHMEM_HUGE_ADVISE },
	{}
};
4541
/*
 * Mount parameter table handed to fs_parse() by shmem_parse_one().
 *
 * The quota options are only present with CONFIG_TMPFS_QUOTA.  "casefold"
 * is listed twice on purpose: once as a string parameter
 * (Opt_casefold_version) and once as a bare flag (Opt_casefold).
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
	fsparam_gid   ("gid",		Opt_gid),
	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("mpol",		Opt_mpol),
	fsparam_string("nr_blocks",	Opt_nr_blocks),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("size",		Opt_size),
	fsparam_uid   ("uid",		Opt_uid),
	fsparam_flag  ("inode32",	Opt_inode32),
	fsparam_flag  ("inode64",	Opt_inode64),
	fsparam_flag  ("noswap",	Opt_noswap),
#ifdef CONFIG_TMPFS_QUOTA
	fsparam_flag  ("quota",		Opt_quota),
	fsparam_flag  ("usrquota",	Opt_usrquota),
	fsparam_flag  ("grpquota",	Opt_grpquota),
	fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
	fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
	fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
	fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
	fsparam_string("casefold",	Opt_casefold_version),
	fsparam_flag  ("casefold",	Opt_casefold),
	fsparam_flag  ("strict_encoding", Opt_strict_encoding),
	{}
};
4568
4569#if IS_ENABLED(CONFIG_UNICODE)
4570static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4571				    bool latest_version)
4572{
4573	struct shmem_options *ctx = fc->fs_private;
4574	int version = UTF8_LATEST;
4575	struct unicode_map *encoding;
4576	char *version_str = param->string + 5;
4577
4578	if (!latest_version) {
4579		if (strncmp(param->string, "utf8-", 5))
4580			return invalfc(fc, "Only UTF-8 encodings are supported "
4581				       "in the format: utf8-<version number>");
4582
4583		version = utf8_parse_version(version_str);
4584		if (version < 0)
4585			return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
4586	}
4587
4588	encoding = utf8_load(version);
4589
4590	if (IS_ERR(encoding)) {
4591		return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
4592			       unicode_major(version), unicode_minor(version),
4593			       unicode_rev(version));
4594	}
4595
4596	pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
4597		unicode_major(version), unicode_minor(version), unicode_rev(version));
4598
4599	ctx->encoding = encoding;
4600
4601	return 0;
4602}
4603#else
4604static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4605				    bool latest_version)
4606{
4607	return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4608}
4609#endif
4610
/*
 * Parse one mount parameter into the shmem_options in fc->fs_private.
 *
 * fs_parse() maps @param onto an Opt_* value using shmem_fs_parameters; a
 * negative result from it is returned unchanged.  Several options also
 * set a SHMEM_SEEN_* bit in ctx->seen, which shmem_fill_super() and
 * shmem_reconfigure() test to tell explicit settings from defaults.
 *
 * Returns 0 on success; rejected values return the result of invalfc().
 */
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
	struct shmem_options *ctx = fc->fs_private;
	struct fs_parse_result result;
	unsigned long long size;
	char *rest;
	int opt;
	kuid_t kuid;
	kgid_t kgid;

	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_size:
		size = memparse(param->string, &rest);
		if (*rest == '%') {
			/*
			 * A trailing '%' turns the number into a percentage of
			 * total RAM: percent * PAGE_SIZE * totalram_pages / 100.
			 */
			size <<= PAGE_SHIFT;
			size *= totalram_pages();
			do_div(size, 100);
			rest++;
		}
		if (*rest)
			goto bad_value;
		/* The limit is kept in pages, rounded up. */
		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
		ctx->seen |= SHMEM_SEEN_BLOCKS;
		break;
	case Opt_nr_blocks:
		ctx->blocks = memparse(param->string, &rest);
		if (*rest || ctx->blocks > LONG_MAX)
			goto bad_value;
		ctx->seen |= SHMEM_SEEN_BLOCKS;
		break;
	case Opt_nr_inodes:
		ctx->inodes = memparse(param->string, &rest);
		/* Bounded so that inodes * BOGO_INODE_SIZE cannot exceed ULONG_MAX. */
		if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
			goto bad_value;
		ctx->seen |= SHMEM_SEEN_INODES;
		break;
	case Opt_mode:
		ctx->mode = result.uint_32 & 07777;
		break;
	case Opt_uid:
		kuid = result.uid;

		/*
		 * The requested uid must be representable in the
		 * filesystem's idmapping.
		 */
		if (!kuid_has_mapping(fc->user_ns, kuid))
			goto bad_value;

		ctx->uid = kuid;
		break;
	case Opt_gid:
		kgid = result.gid;

		/*
		 * The requested gid must be representable in the
		 * filesystem's idmapping.
		 */
		if (!kgid_has_mapping(fc->user_ns, kgid))
			goto bad_value;

		ctx->gid = kgid;
		break;
	case Opt_huge:
		ctx->huge = result.uint_32;
		/* Anything but "never" needs transparent hugepage support. */
		if (ctx->huge != SHMEM_HUGE_NEVER &&
		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
		      has_transparent_hugepage()))
			goto unsupported_parameter;
		ctx->seen |= SHMEM_SEEN_HUGE;
		break;
	case Opt_mpol:
		if (IS_ENABLED(CONFIG_NUMA)) {
			/* Drop a policy from an earlier mpol= before parsing the new one. */
			mpol_put(ctx->mpol);
			ctx->mpol = NULL;
			if (mpol_parse_str(param->string, &ctx->mpol))
				goto bad_value;
			break;
		}
		goto unsupported_parameter;
	case Opt_inode32:
		ctx->full_inums = false;
		ctx->seen |= SHMEM_SEEN_INUMS;
		break;
	case Opt_inode64:
		if (sizeof(ino_t) < 8) {
			return invalfc(fc,
				       "Cannot use inode64 with <64bit inums in kernel\n");
		}
		ctx->full_inums = true;
		ctx->seen |= SHMEM_SEEN_INUMS;
		break;
	case Opt_noswap:
		/* Only allowed in the initial user namespace with CAP_SYS_ADMIN. */
		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
			return invalfc(fc,
				       "Turning off swap in unprivileged tmpfs mounts unsupported");
		}
		ctx->noswap = true;
		break;
	case Opt_quota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
		break;
	case Opt_usrquota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= QTYPE_MASK_USR;
		break;
	case Opt_grpquota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= QTYPE_MASK_GRP;
		break;
	/* The four hard limits below must be non-zero and within the quota maxima. */
	case Opt_usrquota_block_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
			return invalfc(fc,
				       "User quota block hardlimit too large.");
		ctx->qlimits.usrquota_bhardlimit = size;
		break;
	case Opt_grpquota_block_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
			return invalfc(fc,
				       "Group quota block hardlimit too large.");
		ctx->qlimits.grpquota_bhardlimit = size;
		break;
	case Opt_usrquota_inode_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
			return invalfc(fc,
				       "User quota inode hardlimit too large.");
		ctx->qlimits.usrquota_ihardlimit = size;
		break;
	case Opt_grpquota_inode_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
			return invalfc(fc,
				       "Group quota inode hardlimit too large.");
		ctx->qlimits.grpquota_ihardlimit = size;
		break;
	case Opt_casefold_version:
		/* "casefold=<string>": the string names the encoding version. */
		return shmem_parse_opt_casefold(fc, param, false);
	case Opt_casefold:
		/* Bare "casefold": use the latest encoding version. */
		return shmem_parse_opt_casefold(fc, param, true);
	case Opt_strict_encoding:
#if IS_ENABLED(CONFIG_UNICODE)
		ctx->strict_encoding = true;
		break;
#else
		return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
#endif
	}
	return 0;

unsupported_parameter:
	return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
	return invalfc(fc, "Bad value for '%s'", param->key);
}
4787
/*
 * Split the next option off the comma-separated list at *@s.
 *
 * A comma followed by a digit is treated as part of the current option
 * (mpol's nodelist may itself contain commas), so only a comma followed by
 * a non-digit ends it.  That comma is overwritten with NUL and *@s is
 * advanced past it; when no such comma remains, *@s becomes NULL.
 *
 * Returns the start of the option, or NULL if *@s was already NULL.
 */
static char *shmem_next_opt(char **s)
{
	char *start = *s;
	char *comma;

	if (!start)
		return NULL;

	while ((comma = strchr(*s, ',')) != NULL) {
		*s = comma + 1;
		if (isdigit(comma[1]))
			continue;	/* still inside a nodelist */
		*comma = '\0';
		return start;
	}

	*s = NULL;
	return start;
}
4815
/*
 * ->parse_monolithic: parse the whole option string in @data, using
 * shmem_next_opt() as the separator callback so that commas inside an
 * mpol nodelist do not split an option.
 */
static int shmem_parse_monolithic(struct fs_context *fc, void *data)
{
	return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
4820
/*
 * Reconfigure a shmem filesystem.
 *
 * All checks and all updates run under sbinfo->stat_lock: first every
 * requested change is validated against the current state, and only if
 * none is rejected are the new values committed.  A rejected request
 * leaves sbinfo untouched and returns the result of invalfc() with the
 * matching message.  Returns 0 on success.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
	unsigned long used_isp;
	struct mempolicy *mpol = NULL;
	const char *err;

	raw_spin_lock(&sbinfo->stat_lock);
	/* Inode space currently in use: the total budget minus what is free. */
	used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;

	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
		if (!sbinfo->max_blocks) {
			err = "Cannot retroactively limit size";
			goto out;
		}
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   ctx->blocks) > 0) {
			err = "Too small a size for current use";
			goto out;
		}
	}
	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
		if (!sbinfo->max_inodes) {
			err = "Cannot retroactively limit inodes";
			goto out;
		}
		if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
			err = "Too few inodes for current use";
			goto out;
		}
	}

	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
	    sbinfo->next_ino > UINT_MAX) {
		err = "Current inum too high to switch to 32-bit inums";
		goto out;
	}

	/*
	 * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
	 * counterpart for (re-)enabling swap.
	 */
	if (ctx->noswap && !sbinfo->noswap) {
		err = "Cannot disable swap on remount";
		goto out;
	}

	if (ctx->seen & SHMEM_SEEN_QUOTA &&
	    !sb_any_quota_loaded(fc->root->d_sb)) {
		err = "Cannot enable quota on remount";
		goto out;
	}

#ifdef CONFIG_TMPFS_QUOTA
/* True when a non-zero limit was requested that differs from the current one. */
#define CHANGED_LIMIT(name)						\
	(ctx->qlimits.name## hardlimit &&				\
	(ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))

	if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
	    CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
		err = "Cannot change global quota limit on remount";
		goto out;
	}
#endif /* CONFIG_TMPFS_QUOTA */

	/* Validation passed: commit the options that were explicitly given. */
	if (ctx->seen & SHMEM_SEEN_HUGE)
		sbinfo->huge = ctx->huge;
	if (ctx->seen & SHMEM_SEEN_INUMS)
		sbinfo->full_inums = ctx->full_inums;
	if (ctx->seen & SHMEM_SEEN_BLOCKS)
		sbinfo->max_blocks  = ctx->blocks;
	if (ctx->seen & SHMEM_SEEN_INODES) {
		sbinfo->max_inodes  = ctx->inodes;
		sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
	}

	/*
	 * Preserve previous mempolicy unless mpol remount option was specified.
	 */
	if (ctx->mpol) {
		mpol = sbinfo->mpol;
		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
		ctx->mpol = NULL;
	}

	if (ctx->noswap)
		sbinfo->noswap = true;

	raw_spin_unlock(&sbinfo->stat_lock);
	/* Drop the replaced policy (NULL if none was swapped out) after unlocking. */
	mpol_put(mpol);
	return 0;
out:
	raw_spin_unlock(&sbinfo->stat_lock);
	return invalfc(fc, "%s", err);
}
4920
/*
 * Emit the mount options of the superblock behind @root into @seq.
 *
 * Size, inode count, mode, uid and gid are printed only when they differ
 * from their defaults; the remaining options are printed when set (see
 * the inode32/inode64 table below for that special case).  Always
 * returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
	struct mempolicy *mpol;

	if (sbinfo->max_blocks != shmem_default_max_blocks())
		seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
	if (sbinfo->max_inodes != shmem_default_max_inodes())
		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
	if (sbinfo->mode != (0777 | S_ISVTX))
		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(seq, ",uid=%u",
				from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(seq, ",gid=%u",
				from_kgid_munged(&init_user_ns, sbinfo->gid));

	/*
	 * Showing inode{64,32} might be useful even if it's the system default,
	 * since then people don't have to resort to checking both here and
	 * /proc/config.gz to confirm 64-bit inums were successfully applied
	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
	 *
	 * We hide it when inode64 isn't the default and we are using 32-bit
	 * inodes, since that probably just means the feature isn't even under
	 * consideration.
	 *
	 * As such:
	 *
	 *                     +-----------------+-----------------+
	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
	 *  +------------------+-----------------+-----------------+
	 *  | full_inums=true  | show            | show            |
	 *  | full_inums=false | show            | hide            |
	 *  +------------------+-----------------+-----------------+
	 *
	 */
	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
	if (sbinfo->huge)
		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
	/* Take a reference on the superblock policy for the duration of the print. */
	mpol = shmem_get_sbmpol(sbinfo);
	shmem_show_mpol(seq, mpol);
	mpol_put(mpol);
	if (sbinfo->noswap)
		seq_printf(seq, ",noswap");
#ifdef CONFIG_TMPFS_QUOTA
	if (sb_has_quota_active(root->d_sb, USRQUOTA))
		seq_printf(seq, ",usrquota");
	if (sb_has_quota_active(root->d_sb, GRPQUOTA))
		seq_printf(seq, ",grpquota");
	/* Hard limits are shown only when non-zero. */
	if (sbinfo->qlimits.usrquota_bhardlimit)
		seq_printf(seq, ",usrquota_block_hardlimit=%lld",
			   sbinfo->qlimits.usrquota_bhardlimit);
	if (sbinfo->qlimits.grpquota_bhardlimit)
		seq_printf(seq, ",grpquota_block_hardlimit=%lld",
			   sbinfo->qlimits.grpquota_bhardlimit);
	if (sbinfo->qlimits.usrquota_ihardlimit)
		seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
			   sbinfo->qlimits.usrquota_ihardlimit);
	if (sbinfo->qlimits.grpquota_ihardlimit)
		seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
			   sbinfo->qlimits.grpquota_ihardlimit);
#endif
	return 0;
}
4991
4992#endif /* CONFIG_TMPFS */
4993
/*
 * Release everything hanging off the superblock's shmem_sb_info and the
 * info structure itself.  Besides being a superblock teardown hook, this
 * is also the failure path of shmem_fill_super(), so it may run on a
 * partially initialised sbinfo.
 */
static void shmem_put_super(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

#if IS_ENABLED(CONFIG_UNICODE)
	/* Unload the casefold encoding installed by shmem_fill_super(), if any. */
	if (sb->s_encoding)
		utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_TMPFS_QUOTA
	shmem_disable_quotas(sb);
#endif
	free_percpu(sbinfo->ino_batch);
	percpu_counter_destroy(&sbinfo->used_blocks);
	mpol_put(sbinfo->mpol);
	kfree(sbinfo);
	/* sbinfo is gone: make sure nobody finds the stale pointer. */
	sb->s_fs_info = NULL;
}
5012
#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
/*
 * Dentry operations using the generic case-insensitive hash and compare
 * helpers; shmem_fill_super() makes them the default d_op when a casefold
 * encoding was requested.
 */
static const struct dentry_operations shmem_ci_dentry_ops = {
	.d_hash = generic_ci_d_hash,
	.d_compare = generic_ci_d_compare,
};
#endif
5019
5020static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
5021{
5022	struct shmem_options *ctx = fc->fs_private;
5023	struct inode *inode;
5024	struct shmem_sb_info *sbinfo;
5025	int error = -ENOMEM;
5026
5027	/* Round up to L1_CACHE_BYTES to resist false sharing */
5028	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
5029				L1_CACHE_BYTES), GFP_KERNEL);
5030	if (!sbinfo)
5031		return error;
5032
5033	sb->s_fs_info = sbinfo;
5034
5035#ifdef CONFIG_TMPFS
5036	/*
5037	 * Per default we only allow half of the physical ram per
5038	 * tmpfs instance, limiting inodes to one per page of lowmem;
5039	 * but the internal instance is left unlimited.
5040	 */
5041	if (!(sb->s_flags & SB_KERNMOUNT)) {
5042		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
5043			ctx->blocks = shmem_default_max_blocks();
5044		if (!(ctx->seen & SHMEM_SEEN_INODES))
5045			ctx->inodes = shmem_default_max_inodes();
5046		if (!(ctx->seen & SHMEM_SEEN_INUMS))
5047			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
5048		sbinfo->noswap = ctx->noswap;
5049	} else {
5050		sb->s_flags |= SB_NOUSER;
5051	}
5052	sb->s_export_op = &shmem_export_ops;
5053	sb->s_flags |= SB_NOSEC;
5054
5055#if IS_ENABLED(CONFIG_UNICODE)
5056	if (!ctx->encoding && ctx->strict_encoding) {
5057		pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
5058		error = -EINVAL;
5059		goto failed;
5060	}
5061
5062	if (ctx->encoding) {
5063		sb->s_encoding = ctx->encoding;
5064		set_default_d_op(sb, &shmem_ci_dentry_ops);
5065		if (ctx->strict_encoding)
5066			sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
5067	}
5068#endif
5069
5070#else
5071	sb->s_flags |= SB_NOUSER;
5072#endif /* CONFIG_TMPFS */
5073	sb->s_d_flags |= DCACHE_DONTCACHE;
5074	sbinfo->max_blocks = ctx->blocks;
5075	sbinfo->max_inodes = ctx->inodes;
5076	sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
5077	if (sb->s_flags & SB_KERNMOUNT) {
5078		sbinfo->ino_batch = alloc_percpu(ino_t);
5079		if (!sbinfo->ino_batch)
5080			goto failed;
5081	}
5082	sbinfo->uid = ctx->uid;
5083	sbinfo->gid = ctx->gid;
5084	sbinfo->full_inums = ctx->full_inums;
5085	sbinfo->mode = ctx->mode;
5086#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5087	if (ctx->seen & SHMEM_SEEN_HUGE)
5088		sbinfo->huge = ctx->huge;
5089	else
5090		sbinfo->huge = tmpfs_huge;
5091#endif
5092	sbinfo->mpol = ctx->mpol;
5093	ctx->mpol = NULL;
5094
5095	raw_spin_lock_init(&sbinfo->stat_lock);
5096	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
5097		goto failed;
5098	spin_lock_init(&sbinfo->shrinklist_lock);
5099	INIT_LIST_HEAD(&sbinfo->shrinklist);
5100
5101	sb->s_maxbytes = MAX_LFS_FILESIZE;
5102	sb->s_blocksize = PAGE_SIZE;
5103	sb->s_blocksize_bits = PAGE_SHIFT;
5104	sb->s_magic = TMPFS_MAGIC;
5105	sb->s_op = &shmem_ops;
5106	sb->s_time_gran = 1;
5107#ifdef CONFIG_TMPFS_XATTR
5108	sb->s_xattr = shmem_xattr_handlers;
5109#endif
5110#ifdef CONFIG_TMPFS_POSIX_ACL
5111	sb->s_flags |= SB_POSIXACL;
5112#endif
5113	uuid_t uuid;
5114	uuid_gen(&uuid);
5115	super_set_uuid(sb, uuid.b, sizeof(uuid));
5116
5117#ifdef CONFIG_TMPFS_QUOTA
5118	if (ctx->seen & SHMEM_SEEN_QUOTA) {
5119		sb->dq_op = &shmem_quota_operations;
5120		sb->s_qcop = &dquot_quotactl_sysfile_ops;
5121		sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
5122
5123		/* Copy the default limits from ctx into sbinfo */
5124		memcpy(&sbinfo->qlimits, &ctx->qlimits,
5125		       sizeof(struct shmem_quota_limits));
5126
5127		if (shmem_enable_quotas(sb, ctx->quota_types))
5128			goto failed;
5129	}
5130#endif /* CONFIG_TMPFS_QUOTA */
5131
5132	inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
5133				S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
5134	if (IS_ERR(inode)) {
5135		error = PTR_ERR(inode);
5136		goto failed;
5137	}
5138	inode->i_uid = sbinfo->uid;
5139	inode->i_gid = sbinfo->gid;
5140	sb->s_root = d_make_root(inode);
5141	if (!sb->s_root)
5142		goto failed;
5143	return 0;
5144
5145failed:
5146	shmem_put_super(sb);
5147	return error;
5148}
5149
/*
 * fs_context ->get_tree hook: hands off to get_tree_nodev(), supplying
 * shmem_fill_super() as the callback that populates the superblock.
 * Returns whatever get_tree_nodev() returns.
 */
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}
5154
5155static void shmem_free_fc(struct fs_context *fc)
5156{
5157	struct shmem_options *ctx = fc->fs_private;
5158
5159	if (ctx) {
5160		mpol_put(ctx->mpol);
5161		kfree(ctx);
5162	}
5163}
5164
/*
 * fs_context operations for tmpfs/shmem.  The ->free and ->get_tree hooks
 * are always present; mount-option parsing and reconfiguration are only
 * wired up when CONFIG_TMPFS is enabled.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_monolithic,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};
5174
5175static struct kmem_cache *shmem_inode_cachep __ro_after_init;
5176
5177static struct inode *shmem_alloc_inode(struct super_block *sb)
5178{
5179	struct shmem_inode_info *info;
5180	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
5181	if (!info)
5182		return NULL;
5183	return &info->vfs_inode;
5184}
5185
5186static void shmem_free_in_core_inode(struct inode *inode)
5187{
5188	if (S_ISLNK(inode->i_mode))
5189		kfree(inode->i_link);
5190	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
5191}
5192
5193static void shmem_destroy_inode(struct inode *inode)
5194{
5195	if (S_ISREG(inode->i_mode))
5196		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
5197	if (S_ISDIR(inode->i_mode))
5198		simple_offset_destroy(shmem_get_offset_ctx(inode));
5199}
5200
5201static void shmem_init_inode(void *foo)
5202{
5203	struct shmem_inode_info *info = foo;
5204	inode_init_once(&info->vfs_inode);
5205}
5206
/*
 * Create the "shmem_inode_cache" slab cache sized for struct
 * shmem_inode_info, with shmem_init_inode() as the object constructor.
 * The result is not checked here; the cache is created with SLAB_PANIC.
 */
static void __init shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
5213
/* Destroy the inode slab cache; used on the shmem_init() failure path. */
static void __init shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
5218
/*
 * address_space ->error_remove_folio hook.
 *
 * Keep the page in page cache instead of truncating it: the function does
 * nothing with @mapping or @folio and always reports success (0).
 */
static int shmem_error_remove_folio(struct address_space *mapping,
				   struct folio *folio)
{
	return 0;
}
5225
/*
 * Address-space operations for shmem mappings.  write_begin/write_end are
 * only provided with CONFIG_TMPFS, and folio migration only with
 * CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
	.error_remove_folio = shmem_error_remove_folio,
};
5237
/*
 * File operations for shmem regular files.  Mapping-related hooks are
 * always available; the read/write/seek/splice/fallocate entry points are
 * only compiled in with CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
	.mmap_prepare	= shmem_mmap_prepare,
	.open		= shmem_file_open,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= shmem_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= shmem_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};
5252
/*
 * Inode operations for shmem regular files.  The xattr listing, ACL
 * setter and fileattr accessors in this table are all gated on
 * CONFIG_TMPFS_XATTR.
 */
static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
};
5263
/*
 * Inode operations for shmem directories.  Namespace operations (create,
 * link, rename, ...) require CONFIG_TMPFS; xattr listing and fileattr
 * accessors require CONFIG_TMPFS_XATTR; ->setattr and ->set_acl are only
 * set with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
	.get_offset_ctx	= shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
5289
/*
 * Inode operations table named for special inodes.  Only ->getattr is
 * unconditional; xattr listing needs CONFIG_TMPFS_XATTR, and ->setattr /
 * ->set_acl need CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_special_inode_operations = {
	.getattr	= shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
5300
/*
 * Superblock operations (installed as sb->s_op by shmem_fill_super()).
 * statfs/show_options depend on CONFIG_TMPFS, dquot access on
 * CONFIG_TMPFS_QUOTA, and the cached-object callbacks on
 * CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
	.get_dquots	= shmem_get_dquots,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= inode_just_drop,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};
5320
/*
 * VMA operations for shmem mappings: faults go through shmem_fault(),
 * map_pages through filemap_map_pages(); NUMA policy hooks are present
 * only with CONFIG_NUMA.
 */
static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
5329
/*
 * VMA operations installed by shmem_zero_setup() and
 * shmem_zero_setup_desc().  The entries are the same as in shmem_vm_ops;
 * it is a distinct object, so the two tables have different addresses.
 */
static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
5338
5339int shmem_init_fs_context(struct fs_context *fc)
5340{
5341	struct shmem_options *ctx;
5342
5343	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
5344	if (!ctx)
5345		return -ENOMEM;
5346
5347	ctx->mode = 0777 | S_ISVTX;
5348	ctx->uid = current_fsuid();
5349	ctx->gid = current_fsgid();
5350
5351#if IS_ENABLED(CONFIG_UNICODE)
5352	ctx->encoding = NULL;
5353#endif
5354
5355	fc->fs_private = ctx;
5356	fc->ops = &shmem_fs_context_ops;
5357#ifdef CONFIG_TMPFS
5358	fc->sb_flags |= SB_I_VERSION;
5359#endif
5360	return 0;
5361}
5362
/*
 * The "tmpfs" filesystem type (full CONFIG_SHMEM implementation).  The
 * mount parameter table is only exposed with CONFIG_TMPFS; superblocks are
 * torn down with kill_anon_super().
 */
static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_anon_super,
	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};
5373
5374#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5375
/*
 * Initializer for a struct kobj_attribute: the attribute name is the
 * stringified @_name, with the given file mode and show/store callbacks.
 */
#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
{									\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
}

/* Define tmpfs_attr_<name> as a write-only (0200) attribute. */
#define TMPFS_ATTR_W(_name, _store)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0200, NULL, _store)

/* Define tmpfs_attr_<name> as a read-write (0644) attribute. */
#define TMPFS_ATTR_RW(_name, _show, _store)			\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)

/* Define tmpfs_attr_<name> as a read-only (0444) attribute. */
#define TMPFS_ATTR_RO(_name, _show)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
5394
5395#if IS_ENABLED(CONFIG_UNICODE)
5396static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
5397			char *buf)
5398{
5399		return sysfs_emit(buf, "supported\n");
5400}
5401TMPFS_ATTR_RO(casefold, casefold_show);
5402#endif
5403
/*
 * NULL-terminated list of tmpfs feature attributes.  The casefold entry is
 * only present when CONFIG_UNICODE is enabled; otherwise the list is empty.
 */
static struct attribute *tmpfs_attributes[] = {
#if IS_ENABLED(CONFIG_UNICODE)
	&tmpfs_attr_casefold.attr,
#endif
	NULL
};
5410
/* Attribute group named "features", holding the tmpfs_attributes list. */
static const struct attribute_group tmpfs_attribute_group = {
	.attrs = tmpfs_attributes,
	.name = "features"
};
5415
5416static struct kobject *tmpfs_kobj;
5417
5418static int __init tmpfs_sysfs_init(void)
5419{
5420	int ret;
5421
5422	tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
5423	if (!tmpfs_kobj)
5424		return -ENOMEM;
5425
5426	ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
5427	if (ret)
5428		kobject_put(tmpfs_kobj);
5429
5430	return ret;
5431}
5432#endif /* CONFIG_SYSFS && CONFIG_TMPFS */
5433
/*
 * Boot-time initialisation of shmem/tmpfs (CONFIG_SHMEM build).
 *
 * Order: create the inode slab cache, register the quota format (if
 * configured), register the "tmpfs" filesystem type, create the internal
 * kernel mount shm_mnt, then set up the sysfs directory and the huge-page
 * defaults.  On failure, shm_mnt is left holding an ERR_PTR() with the
 * error code so that later users can detect it.
 */
void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
	register_quota_format(&shmem_quota_format);
#endif

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
	error = tmpfs_sysfs_init();
	if (error) {
		pr_err("Could not init tmpfs sysfs\n");
		/*
		 * NOTE(review): at this point kern_mount() has succeeded, but
		 * the out1 path below only unregisters the filesystem type and
		 * destroys the inode cache; nothing visible here unmounts
		 * shm_mnt.  Confirm whether that is intentional.
		 */
		goto out1;
	}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Propagate the global shmem_huge setting to the internal mount. */
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other multi-size THPs.
	 */
	if (!shmem_orders_configured)
		huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
	return;

	/* Unwind in reverse order of setup. */
out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}
5489
5490#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5491static ssize_t shmem_enabled_show(struct kobject *kobj,
5492				  struct kobj_attribute *attr, char *buf)
5493{
5494	static const int values[] = {
5495		SHMEM_HUGE_ALWAYS,
5496		SHMEM_HUGE_WITHIN_SIZE,
5497		SHMEM_HUGE_ADVISE,
5498		SHMEM_HUGE_NEVER,
5499		SHMEM_HUGE_DENY,
5500		SHMEM_HUGE_FORCE,
5501	};
5502	int len = 0;
5503	int i;
5504
5505	for (i = 0; i < ARRAY_SIZE(values); i++) {
5506		len += sysfs_emit_at(buf, len,
5507				shmem_huge == values[i] ? "%s[%s]" : "%s%s",
5508				i ? " " : "", shmem_format_huge(values[i]));
5509	}
5510	len += sysfs_emit_at(buf, len, "\n");
5511
5512	return len;
5513}
5514
5515static ssize_t shmem_enabled_store(struct kobject *kobj,
5516		struct kobj_attribute *attr, const char *buf, size_t count)
5517{
5518	char tmp[16];
5519	int huge, err;
5520
5521	if (count + 1 > sizeof(tmp))
5522		return -EINVAL;
5523	memcpy(tmp, buf, count);
5524	tmp[count] = '\0';
5525	if (count && tmp[count - 1] == '\n')
5526		tmp[count - 1] = '\0';
5527
5528	huge = shmem_parse_huge(tmp);
5529	if (huge == -EINVAL)
5530		return huge;
5531
5532	shmem_huge = huge;
5533	if (shmem_huge > SHMEM_HUGE_DENY)
5534		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5535
5536	err = start_stop_khugepaged();
5537	return err ? err : count;
5538}
5539
/* Global "shmem_enabled" attribute (shmem_enabled_show/_store). */
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
/* Taken by thpsize_shmem_enabled_store() around huge_shmem_orders_* updates. */
static DEFINE_SPINLOCK(huge_shmem_orders_lock);
5542
5543static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
5544					  struct kobj_attribute *attr, char *buf)
5545{
5546	int order = to_thpsize(kobj)->order;
5547	const char *output;
5548
5549	if (test_bit(order, &huge_shmem_orders_always))
5550		output = "[always] inherit within_size advise never";
5551	else if (test_bit(order, &huge_shmem_orders_inherit))
5552		output = "always [inherit] within_size advise never";
5553	else if (test_bit(order, &huge_shmem_orders_within_size))
5554		output = "always inherit [within_size] advise never";
5555	else if (test_bit(order, &huge_shmem_orders_madvise))
5556		output = "always inherit within_size [advise] never";
5557	else
5558		output = "always inherit within_size advise [never]";
5559
5560	return sysfs_emit(buf, "%s\n", output);
5561}
5562
/*
 * sysfs store callback for the per-size shmem_enabled attribute.
 *
 * Accepts "always", "inherit", "within_size", "advise" or "never" (compared
 * with sysfs_streq()).  For a recognised policy, the order's bit is cleared
 * in the other policy bitmaps and set in the selected one, all under
 * huge_shmem_orders_lock; "never" clears it everywhere.  Afterwards
 * khugepaged is started or stopped as needed.
 *
 * Returns @count on success, -EINVAL for an unknown policy or a rejected
 * "inherit" request, or the error from start_stop_khugepaged().
 */
static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_always);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		/* Do not override huge allocation policy with non-PMD sized mTHP */
		if (shmem_huge == SHMEM_HUGE_FORCE &&
		    order != HPAGE_PMD_ORDER)
			return -EINVAL;

		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_inherit);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "within_size")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		set_bit(order, &huge_shmem_orders_within_size);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "advise")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		/* "never" is represented by the bit being clear in every bitmap. */
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		clear_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else {
		ret = -EINVAL;
	}

	/* Only re-evaluate khugepaged when a policy was actually applied. */
	if (ret > 0) {
		int err = start_stop_khugepaged();

		if (err)
			ret = err;
	}
	return ret;
}
5622
/* Per-size "shmem_enabled" attribute, mode 0644. */
struct kobj_attribute thpsize_shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
5625#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5626
5627#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
5628
5629static int __init setup_transparent_hugepage_shmem(char *str)
5630{
5631	int huge;
5632
5633	huge = shmem_parse_huge(str);
5634	if (huge == -EINVAL) {
5635		pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
5636		return huge;
5637	}
5638
5639	shmem_huge = huge;
5640	return 1;
5641}
5642__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
5643
5644static int __init setup_transparent_hugepage_tmpfs(char *str)
5645{
5646	int huge;
5647
5648	huge = shmem_parse_huge(str);
5649	if (huge < 0) {
5650		pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
5651		return huge;
5652	}
5653
5654	tmpfs_huge = huge;
5655	return 1;
5656}
5657__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
5658
/* Scratch copy of the boot argument; strsep() below modifies it in place. */
static char str_dup[PAGE_SIZE] __initdata;

/*
 * Handler for the "thp_shmem=" boot parameter.
 *
 * The argument is a ';'-separated list of "<sizes>:<policy>" entries, where
 * <sizes> is a ','-separated list of either a single size or a
 * "<start>-<end>" range, and <policy> is one of "always", "advise",
 * "inherit", "within_size" or "never".  Sizes are converted to orders with
 * get_order_from_str(); a negative result is treated as invalid.
 *
 * All changes are accumulated in local copies of the four
 * huge_shmem_orders_* bitmaps and only committed once the whole string has
 * parsed cleanly, so a malformed argument leaves the globals untouched.
 *
 * Returns 1 on success, 0 (after a warning) on any parse error.
 */
static int __init setup_thp_shmem(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise, within_size;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	/* Reject arguments that would not fit in str_dup with their NUL. */
	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strscpy(str_dup, str);

	/* Start from the current settings so unspecified orders are kept. */
	always = huge_shmem_orders_always;
	inherit = huge_shmem_orders_inherit;
	madvise = huge_shmem_orders_madvise;
	within_size = huge_shmem_orders_within_size;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		/* Split "<sizes>:<policy>"; token is NULL if ':' was missing. */
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				/* "<start>-<end>" range of sizes */
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size,
							   THP_ORDERS_ALL_FILE_DEFAULT);
				end = get_order_from_str(end_size,
							 THP_ORDERS_ALL_FILE_DEFAULT);
			} else {
				/* single size: treat as a one-element range */
				start_size = end_size = subtoken;
				start = end = get_order_from_str(subtoken,
								 THP_ORDERS_ALL_FILE_DEFAULT);
			}

			if (start < 0) {
				pr_err("invalid size %s in thp_shmem boot parameter\n",
				       start_size);
				goto err;
			}

			if (end < 0) {
				pr_err("invalid size %s in thp_shmem boot parameter\n",
				       end_size);
				goto err;
			}

			if (start > end)
				goto err;

			/*
			 * Set the order range in the chosen policy bitmap and
			 * clear it in the other three, so each order ends up in
			 * at most one bitmap ("never" clears it in all four).
			 */
			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "advise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "within_size")) {
				bitmap_set(&within_size, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else {
				pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
				goto err;
			}
		}
	}

	/* Whole string parsed: commit and record that orders were configured. */
	huge_shmem_orders_always = always;
	huge_shmem_orders_madvise = madvise;
	huge_shmem_orders_inherit = inherit;
	huge_shmem_orders_within_size = within_size;
	shmem_orders_configured = true;
	return 1;

err:
	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_shmem=", setup_thp_shmem);
5759
5760#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5761
5762#else /* !CONFIG_SHMEM */
5763
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
5772
/*
 * The "tmpfs" filesystem type for the !CONFIG_SHMEM build: context setup,
 * parameters and superblock teardown are all taken from ramfs.
 */
static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};
5780
5781void __init shmem_init(void)
5782{
5783	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
5784
5785	shm_mnt = kern_mount(&shmem_fs_type);
5786	BUG_ON(IS_ERR(shm_mnt));
5787}
5788
/* !CONFIG_SHMEM stub: does nothing and always returns 0. */
int shmem_unuse(unsigned int type)
{
	return 0;
}
5793
/* !CONFIG_SHMEM stub: ignores its arguments and always returns 0. */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}
5798
/* !CONFIG_SHMEM stub: intentionally empty. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
5802
5803#ifdef CONFIG_MMU
/*
 * !CONFIG_SHMEM variant: passes all arguments straight through to
 * mm_get_unmapped_area() and returns its result.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
5810#endif
5811
/*
 * !CONFIG_SHMEM variant: truncate the byte range [@lstart, @lend] of
 * @inode's mapping via truncate_inode_pages_range().
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
5817
/*
 * In the !CONFIG_SHMEM build the shmem operation tables used by the common
 * code below are aliases for the generic/ramfs ones.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
5821
/* !CONFIG_SHMEM stub: no size accounting is done; always returns 0. */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return 0;
}
5826
/* !CONFIG_SHMEM stub: nothing to undo, intentionally empty. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
}
5830
5831static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
5832				struct super_block *sb, struct inode *dir,
5833				umode_t mode, dev_t dev, unsigned long flags)
5834{
5835	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5836	return inode ? inode : ERR_PTR(-ENOSPC);
5837}
5838
5839#endif /* CONFIG_SHMEM */
5840
5841/* common code */
5842
/*
 * Common helper behind the shmem_*file_setup*() entry points: create an
 * unlinked regular file of @size bytes on @mnt.
 *
 * @mnt:      mount to create the file on; an ERR_PTR() is passed back as-is
 * @name:     name handed to alloc_file_pseudo()
 * @size:     file size; must be within [0, MAX_LFS_FILESIZE]
 * @vm_flags: VM_NORESERVE selects SHMEM_F_NORESERVE for size accounting
 * @i_flags:  extra bits OR-ed into inode->i_flags
 *
 * Returns the new file, or an ERR_PTR(): -EINVAL for a bad size or an
 * idmapped mount, -ENOMEM if size accounting fails, or the error from
 * inode/file creation.
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long vm_flags,
				       unsigned int i_flags)
{
	unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, vm_flags);
	if (IS_ERR(inode)) {
		/* No inode was created, so undo the accounting by hand. */
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	/* Wrap the int result so both steps share one IS_ERR() check. */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}
5880
/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * 	kernel internal.  There will be NO LSM permission checks against the
 * 	underlying inode.  So users of this interface must do LSM checks at a
 * 	higher layer.  The users are the big_key and shm implementations.  LSM
 * 	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with S_PRIVATE set in
 * its inode flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
5896
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with no extra inode
 * flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
5908
/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), but on the caller-supplied @mnt instead of
 * the internal shm_mnt.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
5922
5923static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
5924{
5925	loff_t size = end - start;
5926
5927	/*
5928	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
5929	 * between XFS directory reading and selinux: since this file is only
5930	 * accessible to the user through its mapping, use S_PRIVATE flag to
5931	 * bypass file security, in the same way as shmem_kernel_file_setup().
5932	 */
5933	return shmem_kernel_file_setup("dev/zero", size, vm_flags);
5934}
5935
5936/**
5937 * shmem_zero_setup - setup a shared anonymous mapping
5938 * @vma: the vma to be mmapped is prepared by do_mmap
5939 * Returns: 0 on success, or error
5940 */
5941int shmem_zero_setup(struct vm_area_struct *vma)
5942{
5943	struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
5944
5945	if (IS_ERR(file))
5946		return PTR_ERR(file);
5947
5948	if (vma->vm_file)
5949		fput(vma->vm_file);
5950	vma->vm_file = file;
5951	vma->vm_ops = &shmem_anon_vm_ops;
5952
5953	return 0;
5954}
5955
/**
 * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
 * descriptor for convenience.
 * @desc: Describes VMA
 *
 * Stores the new backing file and shmem_anon_vm_ops in @desc.  Unlike
 * shmem_zero_setup(), any file already recorded in @desc->vm_file is
 * overwritten without being released here.
 *
 * Returns: 0 on success, or error
 */
int shmem_zero_setup_desc(struct vm_area_desc *desc)
{
	struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);

	if (IS_ERR(file))
		return PTR_ERR(file);

	desc->vm_file = file;
	desc->vm_ops = &shmem_anon_vm_ops;

	return 0;
}
5974
/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the folio's address_space
 * @index:	the folio index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 *
 * Return: the folio (unlocked before returning in the CONFIG_SHMEM case),
 * or an ERR_PTR() on failure.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
				    &folio, SGP_CACHE, gfp, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	/* Hand the folio back unlocked. */
	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
6013
6014struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
6015					 pgoff_t index, gfp_t gfp)
6016{
6017	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
6018	struct page *page;
6019
6020	if (IS_ERR(folio))
6021		return &folio->page;
6022
6023	page = folio_file_page(folio, index);
6024	if (PageHWPoison(page)) {
6025		folio_put(folio);
6026		return ERR_PTR(-EIO);
6027	}
6028
6029	return page;
6030}
6031EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);