mm/shmem.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / shmem.c
at master 162 kB view raw
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *		 2000 Transmeta Corp.
   6 *		 2000-2001 Christoph Rohland
   7 *		 2000-2001 SAP AG
   8 *		 2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/fileattr.h>
  32#include <linux/mm.h>
  33#include <linux/random.h>
  34#include <linux/sched/signal.h>
  35#include <linux/export.h>
  36#include <linux/shmem_fs.h>
  37#include <linux/swap.h>
  38#include <linux/uio.h>
  39#include <linux/hugetlb.h>
  40#include <linux/fs_parser.h>
  41#include <linux/swapfile.h>
  42#include <linux/iversion.h>
  43#include <linux/unicode.h>
  44#include "swap.h"
  45
  46static struct vfsmount *shm_mnt __ro_after_init;
  47
  48#ifdef CONFIG_SHMEM
  49/*
  50 * This virtual memory filesystem is heavily based on the ramfs. It
  51 * extends ramfs by the ability to use swap and honor resource limits
  52 * which makes it a completely usable filesystem.
  53 */
  54
  55#include <linux/xattr.h>
  56#include <linux/exportfs.h>
  57#include <linux/posix_acl.h>
  58#include <linux/posix_acl_xattr.h>
  59#include <linux/mman.h>
  60#include <linux/string.h>
  61#include <linux/slab.h>
  62#include <linux/backing-dev.h>
  63#include <linux/writeback.h>
  64#include <linux/pagevec.h>
  65#include <linux/percpu_counter.h>
  66#include <linux/falloc.h>
  67#include <linux/splice.h>
  68#include <linux/security.h>
  69#include <linux/leafops.h>
  70#include <linux/mempolicy.h>
  71#include <linux/namei.h>
  72#include <linux/ctype.h>
  73#include <linux/migrate.h>
  74#include <linux/highmem.h>
  75#include <linux/seq_file.h>
  76#include <linux/magic.h>
  77#include <linux/syscalls.h>
  78#include <linux/fcntl.h>
  79#include <uapi/linux/memfd.h>
  80#include <linux/rmap.h>
  81#include <linux/uuid.h>
  82#include <linux/quotaops.h>
  83#include <linux/rcupdate_wait.h>
  84
  85#include <linux/uaccess.h>
  86
  87#include "internal.h"
  88
  89#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
  90
  91/* Pretend that each entry is of this size in directory's i_size */
  92#define BOGO_DIRENT_SIZE 20
  93
  94/* Pretend that one inode + its dentry occupy this much memory */
  95#define BOGO_INODE_SIZE 1024
  96
  97/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
  98#define SHORT_SYMLINK_LEN 128
  99
 100/*
 101 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
 102 * inode->i_private (with i_rwsem making sure that it has only one user at
 103 * a time): we would prefer not to enlarge the shmem inode just for that.
 104 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* first page offset of the range being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* running count of new pages fallocated */
	pgoff_t nr_unswapped;	/* running count of writeout refusals to swap out */
};
 112
/*
 * Option values for a shmem/tmpfs instance.
 * NOTE(review): presumably filled in by the mount option parser, which is
 * not visible in this part of the file - confirm against its users.
 */
struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;	/* presumably a mask of the SHMEM_SEEN_* bits below - confirm */
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
	/* The encoding fields exist only when CONFIG_UNICODE is enabled */
#if IS_ENABLED(CONFIG_UNICODE)
	struct unicode_map *encoding;
	bool strict_encoding;
#endif
	/* Distinct single-bit values, one per option kind */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_QUOTA 16
};
 136
 137#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 138static unsigned long huge_shmem_orders_always __read_mostly;
 139static unsigned long huge_shmem_orders_madvise __read_mostly;
 140static unsigned long huge_shmem_orders_inherit __read_mostly;
 141static unsigned long huge_shmem_orders_within_size __read_mostly;
 142static bool shmem_orders_configured __initdata;
 143#endif
 144
 145#ifdef CONFIG_TMPFS
 146static unsigned long shmem_default_max_blocks(void)
 147{
 148	return totalram_pages() / 2;
 149}
 150
 151static unsigned long shmem_default_max_inodes(void)
 152{
 153	unsigned long nr_pages = totalram_pages();
 154
 155	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
 156			ULONG_MAX / BOGO_INODE_SIZE);
 157}
 158#endif
 159
 160static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 161			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
 162			struct vm_area_struct *vma, vm_fault_t *fault_type);
 163
 164static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 165{
 166	return sb->s_fs_info;
 167}
 168
 169/*
 170 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 171 * for shared memory and for shared anonymous (/dev/zero) mappings
 172 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 173 * consistent with the pre-accounting of private mappings ...
 174 */
 175static inline int shmem_acct_size(unsigned long flags, loff_t size)
 176{
 177	return (flags & SHMEM_F_NORESERVE) ?
 178		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 179}
 180
 181static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 182{
 183	if (!(flags & SHMEM_F_NORESERVE))
 184		vm_unacct_memory(VM_ACCT(size));
 185}
 186
 187static inline int shmem_reacct_size(unsigned long flags,
 188		loff_t oldsize, loff_t newsize)
 189{
 190	if (!(flags & SHMEM_F_NORESERVE)) {
 191		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 192			return security_vm_enough_memory_mm(current->mm,
 193					VM_ACCT(newsize) - VM_ACCT(oldsize));
 194		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 195			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 196	}
 197	return 0;
 198}
 199
 200/*
 201 * ... whereas tmpfs objects are accounted incrementally as
 202 * pages are allocated, in order to allow large sparse files.
 203 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 204 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 205 */
 206static inline int shmem_acct_blocks(unsigned long flags, long pages)
 207{
 208	if (!(flags & SHMEM_F_NORESERVE))
 209		return 0;
 210
 211	return security_vm_enough_memory_mm(current->mm,
 212			pages * VM_ACCT(PAGE_SIZE));
 213}
 214
 215static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 216{
 217	if (flags & SHMEM_F_NORESERVE)
 218		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 219}
 220
 221int shmem_inode_acct_blocks(struct inode *inode, long pages)
 222{
 223	struct shmem_inode_info *info = SHMEM_I(inode);
 224	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 225	int err = -ENOSPC;
 226
 227	if (shmem_acct_blocks(info->flags, pages))
 228		return err;
 229
 230	might_sleep();	/* when quotas */
 231	if (sbinfo->max_blocks) {
 232		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
 233						sbinfo->max_blocks, pages))
 234			goto unacct;
 235
 236		err = dquot_alloc_block_nodirty(inode, pages);
 237		if (err) {
 238			percpu_counter_sub(&sbinfo->used_blocks, pages);
 239			goto unacct;
 240		}
 241	} else {
 242		err = dquot_alloc_block_nodirty(inode, pages);
 243		if (err)
 244			goto unacct;
 245	}
 246
 247	return 0;
 248
 249unacct:
 250	shmem_unacct_blocks(info->flags, pages);
 251	return err;
 252}
 253
 254static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
 255{
 256	struct shmem_inode_info *info = SHMEM_I(inode);
 257	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 258
 259	might_sleep();	/* when quotas */
 260	dquot_free_block_nodirty(inode, pages);
 261
 262	if (sbinfo->max_blocks)
 263		percpu_counter_sub(&sbinfo->used_blocks, pages);
 264	shmem_unacct_blocks(info->flags, pages);
 265}
 266
 267static const struct super_operations shmem_ops;
 268static const struct address_space_operations shmem_aops;
 269static const struct file_operations shmem_file_operations;
 270static const struct inode_operations shmem_inode_operations;
 271static const struct inode_operations shmem_dir_inode_operations;
 272static const struct inode_operations shmem_special_inode_operations;
 273static const struct vm_operations_struct shmem_vm_ops;
 274static const struct vm_operations_struct shmem_anon_vm_ops;
 275static struct file_system_type shmem_fs_type;
 276
 277bool shmem_mapping(const struct address_space *mapping)
 278{
 279	return mapping->a_ops == &shmem_aops;
 280}
 281EXPORT_SYMBOL_GPL(shmem_mapping);
 282
 283bool vma_is_anon_shmem(const struct vm_area_struct *vma)
 284{
 285	return vma->vm_ops == &shmem_anon_vm_ops;
 286}
 287
 288bool vma_is_shmem(const struct vm_area_struct *vma)
 289{
 290	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
 291}
 292
 293static LIST_HEAD(shmem_swaplist);
 294static DEFINE_SPINLOCK(shmem_swaplist_lock);
 295
 296#ifdef CONFIG_TMPFS_QUOTA
 297
 298static int shmem_enable_quotas(struct super_block *sb,
 299			       unsigned short quota_types)
 300{
 301	int type, err = 0;
 302
 303	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
 304	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
 305		if (!(quota_types & (1 << type)))
 306			continue;
 307		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
 308					  DQUOT_USAGE_ENABLED |
 309					  DQUOT_LIMITS_ENABLED);
 310		if (err)
 311			goto out_err;
 312	}
 313	return 0;
 314
 315out_err:
 316	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
 317		type, err);
 318	for (type--; type >= 0; type--)
 319		dquot_quota_off(sb, type);
 320	return err;
 321}
 322
 323static void shmem_disable_quotas(struct super_block *sb)
 324{
 325	int type;
 326
 327	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
 328		dquot_quota_off(sb, type);
 329}
 330
 331static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
 332{
 333	return SHMEM_I(inode)->i_dquot;
 334}
 335#endif /* CONFIG_TMPFS_QUOTA */
 336
 337/*
 338 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 339 * produces a novel ino for the newly allocated inode.
 340 *
 341 * It may also be called when making a hard link to permit the space needed by
 342 * each dentry. However, in that case, no new inode number is needed since that
 343 * internally draws from another pool of inode numbers (currently global
 344 * get_next_ino()). This case is indicated by passing NULL as inop.
 345 */
 346#define SHMEM_INO_BATCH 1024
 347static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
 348{
 349	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 350	ino_t ino;
 351
 352	if (!(sb->s_flags & SB_KERNMOUNT)) {
 353		raw_spin_lock(&sbinfo->stat_lock);
 354		if (sbinfo->max_inodes) {
 355			if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
 356				raw_spin_unlock(&sbinfo->stat_lock);
 357				return -ENOSPC;
 358			}
 359			sbinfo->free_ispace -= BOGO_INODE_SIZE;
 360		}
 361		if (inop) {
 362			ino = sbinfo->next_ino++;
 363			if (unlikely(is_zero_ino(ino)))
 364				ino = sbinfo->next_ino++;
 365			if (unlikely(!sbinfo->full_inums &&
 366				     ino > UINT_MAX)) {
 367				/*
 368				 * Emulate get_next_ino uint wraparound for
 369				 * compatibility
 370				 */
 371				if (IS_ENABLED(CONFIG_64BIT))
 372					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
 373						__func__, MINOR(sb->s_dev));
 374				sbinfo->next_ino = 1;
 375				ino = sbinfo->next_ino++;
 376			}
 377			*inop = ino;
 378		}
 379		raw_spin_unlock(&sbinfo->stat_lock);
 380	} else if (inop) {
 381		/*
 382		 * __shmem_file_setup, one of our callers, is lock-free: it
 383		 * doesn't hold stat_lock in shmem_reserve_inode since
 384		 * max_inodes is always 0, and is called from potentially
 385		 * unknown contexts. As such, use a per-cpu batched allocator
 386		 * which doesn't require the per-sb stat_lock unless we are at
 387		 * the batch boundary.
 388		 *
 389		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
 390		 * shmem mounts are not exposed to userspace, so we don't need
 391		 * to worry about things like glibc compatibility.
 392		 */
 393		ino_t *next_ino;
 394
 395		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
 396		ino = *next_ino;
 397		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
 398			raw_spin_lock(&sbinfo->stat_lock);
 399			ino = sbinfo->next_ino;
 400			sbinfo->next_ino += SHMEM_INO_BATCH;
 401			raw_spin_unlock(&sbinfo->stat_lock);
 402			if (unlikely(is_zero_ino(ino)))
 403				ino++;
 404		}
 405		*inop = ino;
 406		*next_ino = ++ino;
 407		put_cpu();
 408	}
 409
 410	return 0;
 411}
 412
 413static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
 414{
 415	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 416	if (sbinfo->max_inodes) {
 417		raw_spin_lock(&sbinfo->stat_lock);
 418		sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
 419		raw_spin_unlock(&sbinfo->stat_lock);
 420	}
 421}
 422
 423/**
 424 * shmem_recalc_inode - recalculate the block usage of an inode
 425 * @inode: inode to recalc
 426 * @alloced: the change in number of pages allocated to inode
 427 * @swapped: the change in number of pages swapped from inode
 428 *
 429 * We have to calculate the free blocks since the mm can drop
 430 * undirtied hole pages behind our back.
 431 *
 432 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 433 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 434 *
 435 * Return: true if swapped was incremented from 0, for shmem_writeout().
 436 */
 437bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
 438{
 439	struct shmem_inode_info *info = SHMEM_I(inode);
 440	bool first_swapped = false;
 441	long freed;
 442
 443	spin_lock(&info->lock);
 444	info->alloced += alloced;
 445	info->swapped += swapped;
 446	freed = info->alloced - info->swapped -
 447		READ_ONCE(inode->i_mapping->nrpages);
 448	/*
 449	 * Special case: whereas normally shmem_recalc_inode() is called
 450	 * after i_mapping->nrpages has already been adjusted (up or down),
 451	 * shmem_writeout() has to raise swapped before nrpages is lowered -
 452	 * to stop a racing shmem_recalc_inode() from thinking that a page has
 453	 * been freed.  Compensate here, to avoid the need for a followup call.
 454	 */
 455	if (swapped > 0) {
 456		if (info->swapped == swapped)
 457			first_swapped = true;
 458		freed += swapped;
 459	}
 460	if (freed > 0)
 461		info->alloced -= freed;
 462	spin_unlock(&info->lock);
 463
 464	/* The quota case may block */
 465	if (freed > 0)
 466		shmem_inode_unacct_blocks(inode, freed);
 467	return first_swapped;
 468}
 469
 470bool shmem_charge(struct inode *inode, long pages)
 471{
 472	struct address_space *mapping = inode->i_mapping;
 473
 474	if (shmem_inode_acct_blocks(inode, pages))
 475		return false;
 476
 477	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
 478	xa_lock_irq(&mapping->i_pages);
 479	mapping->nrpages += pages;
 480	xa_unlock_irq(&mapping->i_pages);
 481
 482	shmem_recalc_inode(inode, pages, 0);
 483	return true;
 484}
 485
 486void shmem_uncharge(struct inode *inode, long pages)
 487{
 488	/* pages argument is currently unused: keep it to help debugging */
 489	/* nrpages adjustment done by __filemap_remove_folio() or caller */
 490
 491	shmem_recalc_inode(inode, 0, 0);
 492}
 493
 494/*
 495 * Replace item expected in xarray by a new item, while holding xa_lock.
 496 */
 497static int shmem_replace_entry(struct address_space *mapping,
 498			pgoff_t index, void *expected, void *replacement)
 499{
 500	XA_STATE(xas, &mapping->i_pages, index);
 501	void *item;
 502
 503	VM_BUG_ON(!expected);
 504	VM_BUG_ON(!replacement);
 505	item = xas_load(&xas);
 506	if (item != expected)
 507		return -ENOENT;
 508	xas_store(&xas, replacement);
 509	return 0;
 510}
 511
 512/*
 513 * Sometimes, before we decide whether to proceed or to fail, we must check
 514 * that an entry was not already brought back or split by a racing thread.
 515 *
 516 * Checking folio is not enough: by the time a swapcache folio is locked, it
 517 * might be reused, and again be swapcache, using the same swap as before.
 518 * Returns the swap entry's order if it still presents, else returns -1.
 519 */
 520static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
 521			      swp_entry_t swap)
 522{
 523	XA_STATE(xas, &mapping->i_pages, index);
 524	int ret = -1;
 525	void *entry;
 526
 527	rcu_read_lock();
 528	do {
 529		entry = xas_load(&xas);
 530		if (entry == swp_to_radix_entry(swap))
 531			ret = xas_get_order(&xas);
 532	} while (xas_retry(&xas, entry));
 533	rcu_read_unlock();
 534	return ret;
 535}
 536
 537/*
 538 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 539 *
 540 * SHMEM_HUGE_NEVER:
 541 *	disables huge pages for the mount;
 542 * SHMEM_HUGE_ALWAYS:
 543 *	enables huge pages for the mount;
 544 * SHMEM_HUGE_WITHIN_SIZE:
 545 *	only allocate huge pages if the page will be fully within i_size,
 546 *	also respect madvise() hints;
 547 * SHMEM_HUGE_ADVISE:
 548 *	only allocate huge pages if requested with madvise();
 549 */
 550
 551#define SHMEM_HUGE_NEVER	0
 552#define SHMEM_HUGE_ALWAYS	1
 553#define SHMEM_HUGE_WITHIN_SIZE	2
 554#define SHMEM_HUGE_ADVISE	3
 555
 556/*
 557 * Special values.
 558 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 559 *
 560 * SHMEM_HUGE_DENY:
 561 *	disables huge on shm_mnt and all mounts, for emergency use;
 562 * SHMEM_HUGE_FORCE:
 563 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 564 *
 565 */
 566#define SHMEM_HUGE_DENY		(-1)
 567#define SHMEM_HUGE_FORCE	(-2)
 568
 569#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 570/* ifdef here to avoid bloating shmem.o when not necessary */
 571
 572#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER)
 573#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
 574#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
 575#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
 576#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
 577#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
 578#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
 579#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE
 580#else
 581#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
 582#endif
 583
 584static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT;
 585
 586#undef SHMEM_HUGE_DEFAULT
 587
 588#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER)
 589#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
 590#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS)
 591#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
 592#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE)
 593#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
 594#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE)
 595#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE
 596#else
 597#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
 598#endif
 599
 600static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT;
 601
 602#undef TMPFS_HUGE_DEFAULT
 603
 604static unsigned int shmem_get_orders_within_size(struct inode *inode,
 605		unsigned long within_size_orders, pgoff_t index,
 606		loff_t write_end)
 607{
 608	pgoff_t aligned_index;
 609	unsigned long order;
 610	loff_t i_size;
 611
 612	order = highest_order(within_size_orders);
 613	while (within_size_orders) {
 614		aligned_index = round_up(index + 1, 1 << order);
 615		i_size = max(write_end, i_size_read(inode));
 616		i_size = round_up(i_size, PAGE_SIZE);
 617		if (i_size >> PAGE_SHIFT >= aligned_index)
 618			return within_size_orders;
 619
 620		order = next_order(&within_size_orders, order);
 621	}
 622
 623	return 0;
 624}
 625
 626static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 627					      loff_t write_end, bool shmem_huge_force,
 628					      struct vm_area_struct *vma,
 629					      vm_flags_t vm_flags)
 630{
 631	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
 632		0 : BIT(HPAGE_PMD_ORDER);
 633	unsigned long within_size_orders;
 634
 635	if (!S_ISREG(inode->i_mode))
 636		return 0;
 637	if (shmem_huge == SHMEM_HUGE_DENY)
 638		return 0;
 639	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
 640		return maybe_pmd_order;
 641
 642	/*
 643	 * The huge order allocation for anon shmem is controlled through
 644	 * the mTHP interface, so we still use PMD-sized huge order to
 645	 * check whether global control is enabled.
 646	 *
 647	 * For tmpfs with 'huge=always' or 'huge=within_size' mount option,
 648	 * we will always try PMD-sized order first. If that failed, it will
 649	 * fall back to small large folios.
 650	 */
 651	switch (SHMEM_SB(inode->i_sb)->huge) {
 652	case SHMEM_HUGE_ALWAYS:
 653		return THP_ORDERS_ALL_FILE_DEFAULT;
 654	case SHMEM_HUGE_WITHIN_SIZE:
 655		within_size_orders = shmem_get_orders_within_size(inode,
 656				THP_ORDERS_ALL_FILE_DEFAULT, index, write_end);
 657		if (within_size_orders > 0)
 658			return within_size_orders;
 659
 660		fallthrough;
 661	case SHMEM_HUGE_ADVISE:
 662		if (vm_flags & VM_HUGEPAGE)
 663			return THP_ORDERS_ALL_FILE_DEFAULT;
 664		fallthrough;
 665	default:
 666		return 0;
 667	}
 668}
 669
 670static int shmem_parse_huge(const char *str)
 671{
 672	int huge;
 673
 674	if (!str)
 675		return -EINVAL;
 676
 677	if (!strcmp(str, "never"))
 678		huge = SHMEM_HUGE_NEVER;
 679	else if (!strcmp(str, "always"))
 680		huge = SHMEM_HUGE_ALWAYS;
 681	else if (!strcmp(str, "within_size"))
 682		huge = SHMEM_HUGE_WITHIN_SIZE;
 683	else if (!strcmp(str, "advise"))
 684		huge = SHMEM_HUGE_ADVISE;
 685	else if (!strcmp(str, "deny"))
 686		huge = SHMEM_HUGE_DENY;
 687	else if (!strcmp(str, "force"))
 688		huge = SHMEM_HUGE_FORCE;
 689	else
 690		return -EINVAL;
 691
 692	if (!has_transparent_hugepage() &&
 693	    huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
 694		return -EINVAL;
 695
 696	/* Do not override huge allocation policy with non-PMD sized mTHP */
 697	if (huge == SHMEM_HUGE_FORCE &&
 698	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
 699		return -EINVAL;
 700
 701	return huge;
 702}
 703
 704#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 705static const char *shmem_format_huge(int huge)
 706{
 707	switch (huge) {
 708	case SHMEM_HUGE_NEVER:
 709		return "never";
 710	case SHMEM_HUGE_ALWAYS:
 711		return "always";
 712	case SHMEM_HUGE_WITHIN_SIZE:
 713		return "within_size";
 714	case SHMEM_HUGE_ADVISE:
 715		return "advise";
 716	case SHMEM_HUGE_DENY:
 717		return "deny";
 718	case SHMEM_HUGE_FORCE:
 719		return "force";
 720	default:
 721		VM_BUG_ON(1);
 722		return "bad_val";
 723	}
 724}
 725#endif
 726
/*
 * Walk inodes queued on @sbinfo->shrinklist and try to split the large folio
 * sitting at the end of each file.
 *
 * @sbinfo:     superblock info whose shrinklist is processed
 * @sc:         if non-NULL, sc->nr_to_scan bounds how many list entries are
 *              taken; otherwise 128 are taken
 * @nr_to_free: if non-zero, stop splitting once the running "freed" page
 *              count reaches it (remaining inodes go back on the shrinklist)
 *
 * Return: SHRINK_STOP if the shrinklist is empty on entry, otherwise the
 * number of folios that were split.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_free)
{
	LIST_HEAD(list), *pos, *next;
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	unsigned long split = 0, freed = 0;

	/* Unlocked peek; the list is re-walked under the lock below */
	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	/*
	 * Phase 1: under shrinklist_lock, move up to @batch pinned inodes
	 * from the shared shrinklist onto the private @list.
	 */
	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		/* Either way the entry has left the shared list */
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	/*
	 * Phase 2: without the lock, examine each inode on the private list.
	 * Every iteration ends at "put", after the inode was either removed
	 * from the list ("drop") or moved back to the shrinklist ("move_back").
	 */
	list_for_each_safe(pos, next, &list) {
		/*
		 * NOTE(review): this 'next' (a page index) shadows the outer
		 * list cursor 'next' inside the loop body only; the loop
		 * macro's own use of 'next' sits outside this block scope.
		 */
		pgoff_t next, end;
		loff_t i_size;
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		/* Target reached: hand the remaining inodes back untouched */
		if (nr_to_free && freed >= nr_to_free)
			goto move_back;

		/* Look up whatever is cached at the page index of i_size */
		i_size = i_size_read(inode);
		folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
		if (!folio || xa_is_value(folio))
			goto drop;

		/* No large folio at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/* Check if there is anything to gain from splitting */
		next = folio_next_index(folio);
		end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
		if (end <= folio->index || end >= next) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		/* Pages of the folio lying beyond 'end' count as freed */
		freed += next - end;
		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		/* Drop the reference taken by igrab() in phase 1 */
		iput(inode);
	}

	return split;
}
 833
 834static long shmem_unused_huge_scan(struct super_block *sb,
 835		struct shrink_control *sc)
 836{
 837	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 838
 839	if (!READ_ONCE(sbinfo->shrinklist_len))
 840		return SHRINK_STOP;
 841
 842	return shmem_unused_huge_shrink(sbinfo, sc, 0);
 843}
 844
 845static long shmem_unused_huge_count(struct super_block *sb,
 846		struct shrink_control *sc)
 847{
 848	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 849	return READ_ONCE(sbinfo->shrinklist_len);
 850}
 851#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 852
 853#define shmem_huge SHMEM_HUGE_DENY
 854
 855static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 856		struct shrink_control *sc, unsigned long nr_to_free)
 857{
 858	return 0;
 859}
 860
 861static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 862					      loff_t write_end, bool shmem_huge_force,
 863					      struct vm_area_struct *vma,
 864					      vm_flags_t vm_flags)
 865{
 866	return 0;
 867}
 868#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 869
 870static void shmem_update_stats(struct folio *folio, int nr_pages)
 871{
 872	if (folio_test_pmd_mappable(folio))
 873		lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
 874	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
 875	lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
 876}
 877
 878/*
 879 * Somewhat like filemap_add_folio, but error if expected item has gone.
 880 */
int shmem_add_to_page_cache(struct folio *folio,
			    struct address_space *mapping,
			    pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	unsigned long nr = folio_nr_pages(folio);
	swp_entry_t iter, swap;
	void *entry;

	/* @index must be aligned to the folio size; folio locked and swapbacked */
	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	/* One reference per page, taken up front and dropped again on failure */
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	gfp &= GFP_RECLAIM_MASK;
	folio_throttle_swaprate(folio, gfp);
	swap = radix_to_swp_entry(expected);

	/* Retried for as long as xas_nomem() reports it should be */
	do {
		/* Restart the expected-swap walk on every attempt */
		iter = swap;
		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			/*
			 * The range must either be empty, or filled with
			 * expected swap entries. Shmem swap entries are never
			 * partially freed without split of both entry and
			 * folio, so there shouldn't be any holes.
			 */
			if (!expected || entry != swp_to_radix_entry(iter)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/* Step past the pages covered by this entry */
			iter.val += 1 << xas_get_order(&xas);
		}
		/* With @expected, the entries seen must add up to exactly nr pages */
		if (expected && iter.val - nr != swap.val) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		/* Stored: account the new pages while still holding the lock */
		shmem_update_stats(folio, nr);
		mapping->nrpages += nr;
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	/* Failure: detach the folio again and drop the references taken above */
	if (xas_error(&xas)) {
		folio->mapping = NULL;
		folio_ref_sub(folio, nr);
		return xas_error(&xas);
	}

	return 0;
}
 939
 940/*
 941 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 942 */
 943static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 944{
 945	struct address_space *mapping = folio->mapping;
 946	long nr = folio_nr_pages(folio);
 947	int error;
 948
 949	xa_lock_irq(&mapping->i_pages);
 950	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
 951	folio->mapping = NULL;
 952	mapping->nrpages -= nr;
 953	shmem_update_stats(folio, -nr);
 954	xa_unlock_irq(&mapping->i_pages);
 955	folio_put_refs(folio, nr);
 956	BUG_ON(error);
 957}
 958
 959/*
 960 * Remove swap entry from page cache, free the swap and its page cache. Returns
 961 * the number of pages being freed. 0 means entry not found in XArray (0 pages
 962 * being freed).
 963 */
 964static long shmem_free_swap(struct address_space *mapping,
 965			    pgoff_t index, void *radswap)
 966{
 967	int order = xa_get_order(&mapping->i_pages, index);
 968	void *old;
 969
 970	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
 971	if (old != radswap)
 972		return 0;
 973	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
 974
 975	return 1 << order;
 976}
 977
 978/*
 979 * Determine (in bytes) how many of the shmem object's pages mapped by the
 980 * given offsets are swapped out.
 981 *
 982 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 983 * as long as the inode doesn't go away and racy results are not a problem.
 984 */
 985unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 986						pgoff_t start, pgoff_t end)
 987{
 988	XA_STATE(xas, &mapping->i_pages, start);
 989	struct folio *folio;
 990	unsigned long swapped = 0;
 991	unsigned long max = end - 1;
 992
 993	rcu_read_lock();
 994	xas_for_each(&xas, folio, max) {
 995		if (xas_retry(&xas, folio))
 996			continue;
 997		if (xa_is_value(folio))
 998			swapped += 1 << xas_get_order(&xas);
 999		if (xas.xa_index == max)
1000			break;
1001		if (need_resched()) {
1002			xas_pause(&xas);
1003			cond_resched_rcu();
1004		}
1005	}
1006	rcu_read_unlock();
1007
1008	return swapped << PAGE_SHIFT;
1009}
1010
1011/*
1012 * Determine (in bytes) how many of the shmem object's pages mapped by the
1013 * given vma is swapped out.
1014 *
1015 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
1016 * as long as the inode doesn't go away and racy results are not a problem.
1017 */
1018unsigned long shmem_swap_usage(struct vm_area_struct *vma)
1019{
1020	struct inode *inode = file_inode(vma->vm_file);
1021	struct shmem_inode_info *info = SHMEM_I(inode);
1022	struct address_space *mapping = inode->i_mapping;
1023	unsigned long swapped;
1024
1025	/* Be careful as we don't hold info->lock */
1026	swapped = READ_ONCE(info->swapped);
1027
1028	/*
1029	 * The easier cases are when the shmem object has nothing in swap, or
1030	 * the vma maps it whole. Then we can simply use the stats that we
1031	 * already track.
1032	 */
1033	if (!swapped)
1034		return 0;
1035
1036	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
1037		return swapped << PAGE_SHIFT;
1038
1039	/* Here comes the more involved part */
1040	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
1041					vma->vm_pgoff + vma_pages(vma));
1042}
1043
1044/*
1045 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
1046 */
1047void shmem_unlock_mapping(struct address_space *mapping)
1048{
1049	struct folio_batch fbatch;
1050	pgoff_t index = 0;
1051
1052	folio_batch_init(&fbatch);
1053	/*
1054	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
1055	 */
1056	while (!mapping_unevictable(mapping) &&
1057	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
1058		check_move_unevictable_folios(&fbatch);
1059		folio_batch_release(&fbatch);
1060		cond_resched();
1061	}
1062}
1063
1064static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
1065{
1066	struct folio *folio;
1067
1068	/*
1069	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
1070	 * beyond i_size, and reports fallocated folios as holes.
1071	 */
1072	folio = filemap_get_entry(inode->i_mapping, index);
1073	if (!folio)
1074		return folio;
1075	if (!xa_is_value(folio)) {
1076		folio_lock(folio);
1077		if (folio->mapping == inode->i_mapping)
1078			return folio;
1079		/* The folio has been swapped out */
1080		folio_unlock(folio);
1081		folio_put(folio);
1082	}
1083	/*
1084	 * But read a folio back from swap if any of it is within i_size
1085	 * (although in some cases this is just a waste of time).
1086	 */
1087	folio = NULL;
1088	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
1089	return folio;
1090}
1091
1092/*
1093 * Remove range of pages and swap entries from page cache, and free them.
1094 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @inode:    inode whose page cache is trimmed
 * @lstart:   first byte of the range
 * @lend:     last byte of the range (inclusive); -1 means "no upper limit"
 * @unfalloc: true when undoing a failed fallocate, in which case swap
 *            entries and uptodate folios are left alone and no partial
 *            folio zeroing/splitting is done
 *
 * The work is done in three steps: a first pass over the entries returned
 * by find_lock_entries(), then (unless @unfalloc) the partial folios at
 * either edge of the range, and finally a second pass using
 * find_get_entries() which retries whenever an entry changed under it.
1095 */
1096static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
1097								 bool unfalloc)
1098{
1099	struct address_space *mapping = inode->i_mapping;
1100	struct shmem_inode_info *info = SHMEM_I(inode);
	/* First page index at or after lstart (rounded up to a page boundary) */
1101	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Exclusive end index: the page that contains byte lend + 1 */
1102	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
1103	struct folio_batch fbatch;
1104	pgoff_t indices[PAGEVEC_SIZE];
1105	struct folio *folio;
1106	bool same_folio;
1107	long nr_swaps_freed = 0;
1108	pgoff_t index;
1109	int i;
1110
1111	if (lend == -1)
1112		end = -1;	/* unsigned, so actually very big */
1113
	/* Not an unfalloc: if fallocend lies in (start, end], pull it back to start */
1114	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
1115		info->fallocend = start;
1116
1117	folio_batch_init(&fbatch);
1118	index = start;
	/* First pass: handle what find_lock_entries() returns for [start, end) */
1119	while (index < end && find_lock_entries(mapping, &index, end - 1,
1120			&fbatch, indices)) {
1121		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1122			folio = fbatch.folios[i];
1123
1124			if (xa_is_value(folio)) {
1125				if (unfalloc)
1126					continue;
1127				nr_swaps_freed += shmem_free_swap(mapping,
1128							indices[i], folio);
1129				continue;
1130			}
1131
1132			if (!unfalloc || !folio_test_uptodate(folio))
1133				truncate_inode_folio(mapping, folio);
1134			folio_unlock(folio);
1135		}
1136		folio_batch_remove_exceptionals(&fbatch);
1137		folio_batch_release(&fbatch);
1138		cond_resched();
1139	}
1140
1141	/*
1142	 * When undoing a failed fallocate, we want none of the partial folio
1143	 * zeroing and splitting below, but shall want to truncate the whole
1144	 * folio when !uptodate indicates that it was added by this fallocate,
1145	 * even when [lstart, lend] covers only a part of the folio.
1146	 */
1147	if (unfalloc)
1148		goto whole_folios;
1149
	/* Handle the folio containing lstart, which may be only partly covered */
1150	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
1151	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
1152	if (folio) {
1153		same_folio = lend < folio_next_pos(folio);
1154		folio_mark_dirty(folio);
1155		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
1156			start = folio_next_index(folio);
1157			if (same_folio)
1158				end = folio->index;
1159		}
1160		folio_unlock(folio);
1161		folio_put(folio);
1162		folio = NULL;
1163	}
1164
	/* Then the folio containing lend, unless it is that same folio */
1165	if (!same_folio)
1166		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
1167	if (folio) {
1168		folio_mark_dirty(folio);
1169		if (!truncate_inode_partial_folio(folio, lstart, lend))
1170			end = folio->index;
1171		folio_unlock(folio);
1172		folio_put(folio);
1173	}
1174
1175whole_folios:
1176
	/* Second pass: entries are looked up unlocked, so recheck and retry */
1177	index = start;
1178	while (index < end) {
1179		cond_resched();
1180
1181		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
1182				indices)) {
1183			/* If all gone or hole-punch or unfalloc, we're done */
1184			if (index == start || end != -1)
1185				break;
1186			/* But if truncating, restart to make sure all gone */
1187			index = start;
1188			continue;
1189		}
1190		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1191			folio = fbatch.folios[i];
1192
1193			if (xa_is_value(folio)) {
1194				long swaps_freed;
1195
1196				if (unfalloc)
1197					continue;
1198				swaps_freed = shmem_free_swap(mapping, indices[i], folio);
1199				if (!swaps_freed) {
1200					/* Swap was replaced by page: retry */
1201					index = indices[i];
1202					break;
1203				}
1204				nr_swaps_freed += swaps_freed;
1205				continue;
1206			}
1207
1208			folio_lock(folio);
1209
1210			if (!unfalloc || !folio_test_uptodate(folio)) {
1211				if (folio_mapping(folio) != mapping) {
1212					/* Page was replaced by swap: retry */
1213					folio_unlock(folio);
1214					index = indices[i];
1215					break;
1216				}
1217				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1218						folio);
1219
1220				if (!folio_test_large(folio)) {
1221					truncate_inode_folio(mapping, folio);
1222				} else if (truncate_inode_partial_folio(folio, lstart, lend)) {
1223					/*
1224					 * If we split a page, reset the loop so
1225					 * that we pick up the new sub pages.
1226					 * Otherwise the THP was entirely
1227					 * dropped or the target range was
1228					 * zeroed, so just continue the loop as
1229					 * is.
1230					 */
1231					if (!folio_test_large(folio)) {
1232						folio_unlock(folio);
1233						index = start;
1234						break;
1235					}
1236				}
1237			}
1238			folio_unlock(folio);
1239		}
1240		folio_batch_remove_exceptionals(&fbatch);
1241		folio_batch_release(&fbatch);
1242	}
1243
	/* Pass the total of freed swap entries on to shmem_recalc_inode() */
1244	shmem_recalc_inode(inode, 0, -nr_swaps_freed);
1245}
1246
1247void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
1248{
1249	shmem_undo_range(inode, lstart, lend, false);
1250	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1251	inode_inc_iversion(inode);
1252}
1253EXPORT_SYMBOL_GPL(shmem_truncate_range);
1254
1255static int shmem_getattr(struct mnt_idmap *idmap,
1256			 const struct path *path, struct kstat *stat,
1257			 u32 request_mask, unsigned int query_flags)
1258{
1259	struct inode *inode = path->dentry->d_inode;
1260	struct shmem_inode_info *info = SHMEM_I(inode);
1261
1262	if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1263		shmem_recalc_inode(inode, 0, 0);
1264
1265	if (info->fsflags & FS_APPEND_FL)
1266		stat->attributes |= STATX_ATTR_APPEND;
1267	if (info->fsflags & FS_IMMUTABLE_FL)
1268		stat->attributes |= STATX_ATTR_IMMUTABLE;
1269	if (info->fsflags & FS_NODUMP_FL)
1270		stat->attributes |= STATX_ATTR_NODUMP;
1271	stat->attributes_mask |= (STATX_ATTR_APPEND |
1272			STATX_ATTR_IMMUTABLE |
1273			STATX_ATTR_NODUMP);
1274	generic_fillattr(idmap, request_mask, inode, stat);
1275
1276	if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
1277		stat->blksize = HPAGE_PMD_SIZE;
1278
1279	if (request_mask & STATX_BTIME) {
1280		stat->result_mask |= STATX_BTIME;
1281		stat->btime.tv_sec = info->i_crtime.tv_sec;
1282		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1283	}
1284
1285	return 0;
1286}
1287
1288static int shmem_setattr(struct mnt_idmap *idmap,
1289			 struct dentry *dentry, struct iattr *attr)
1290{
1291	struct inode *inode = d_inode(dentry);
1292	struct shmem_inode_info *info = SHMEM_I(inode);
1293	int error;
1294	bool update_mtime = false;
1295	bool update_ctime = true;
1296
1297	error = setattr_prepare(idmap, dentry, attr);
1298	if (error)
1299		return error;
1300
1301	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1302		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1303			return -EPERM;
1304		}
1305	}
1306
1307	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1308		loff_t oldsize = inode->i_size;
1309		loff_t newsize = attr->ia_size;
1310
1311		/* protected by i_rwsem */
1312		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1313		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1314			return -EPERM;
1315
1316		if (newsize != oldsize) {
1317			if (info->flags & SHMEM_F_MAPPING_FROZEN)
1318				return -EPERM;
1319			error = shmem_reacct_size(SHMEM_I(inode)->flags,
1320					oldsize, newsize);
1321			if (error)
1322				return error;
1323			i_size_write(inode, newsize);
1324			update_mtime = true;
1325		} else {
1326			update_ctime = false;
1327		}
1328		if (newsize <= oldsize) {
1329			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1330			if (oldsize > holebegin)
1331				unmap_mapping_range(inode->i_mapping,
1332							holebegin, 0, 1);
1333			if (info->alloced)
1334				shmem_truncate_range(inode,
1335							newsize, (loff_t)-1);
1336			/* unmap again to remove racily COWed private pages */
1337			if (oldsize > holebegin)
1338				unmap_mapping_range(inode->i_mapping,
1339							holebegin, 0, 1);
1340		}
1341	}
1342
1343	if (is_quota_modification(idmap, inode, attr)) {
1344		error = dquot_initialize(inode);
1345		if (error)
1346			return error;
1347	}
1348
1349	/* Transfer quota accounting */
1350	if (i_uid_needs_update(idmap, attr, inode) ||
1351	    i_gid_needs_update(idmap, attr, inode)) {
1352		error = dquot_transfer(idmap, inode, attr);
1353		if (error)
1354			return error;
1355	}
1356
1357	setattr_copy(idmap, inode, attr);
1358	if (attr->ia_valid & ATTR_MODE)
1359		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1360	if (!error && update_ctime) {
1361		inode_set_ctime_current(inode);
1362		if (update_mtime)
1363			inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
1364		inode_inc_iversion(inode);
1365	}
1366	return error;
1367}
1368
1369static void shmem_evict_inode(struct inode *inode)
1370{
1371	struct shmem_inode_info *info = SHMEM_I(inode);
1372	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1373	size_t freed = 0;
1374
1375	if (shmem_mapping(inode->i_mapping)) {
1376		shmem_unacct_size(info->flags, inode->i_size);
1377		inode->i_size = 0;
1378		mapping_set_exiting(inode->i_mapping);
1379		shmem_truncate_range(inode, 0, (loff_t)-1);
1380		if (!list_empty(&info->shrinklist)) {
1381			spin_lock(&sbinfo->shrinklist_lock);
1382			if (!list_empty(&info->shrinklist)) {
1383				list_del_init(&info->shrinklist);
1384				sbinfo->shrinklist_len--;
1385			}
1386			spin_unlock(&sbinfo->shrinklist_lock);
1387		}
1388		while (!list_empty(&info->swaplist)) {
1389			/* Wait while shmem_unuse() is scanning this inode... */
1390			wait_var_event(&info->stop_eviction,
1391				       !atomic_read(&info->stop_eviction));
1392			spin_lock(&shmem_swaplist_lock);
1393			/* ...but beware of the race if we peeked too early */
1394			if (!atomic_read(&info->stop_eviction))
1395				list_del_init(&info->swaplist);
1396			spin_unlock(&shmem_swaplist_lock);
1397		}
1398	}
1399
1400	simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
1401	shmem_free_inode(inode->i_sb, freed);
1402	WARN_ON(inode->i_blocks);
1403	clear_inode(inode);
1404#ifdef CONFIG_TMPFS_QUOTA
1405	dquot_free_inode(inode);
1406	dquot_drop(inode);
1407#endif
1408}
1409
1410static unsigned int shmem_find_swap_entries(struct address_space *mapping,
1411				pgoff_t start, struct folio_batch *fbatch,
1412				pgoff_t *indices, unsigned int type)
1413{
1414	XA_STATE(xas, &mapping->i_pages, start);
1415	struct folio *folio;
1416	swp_entry_t entry;
1417
1418	rcu_read_lock();
1419	xas_for_each(&xas, folio, ULONG_MAX) {
1420		if (xas_retry(&xas, folio))
1421			continue;
1422
1423		if (!xa_is_value(folio))
1424			continue;
1425
1426		entry = radix_to_swp_entry(folio);
1427		/*
1428		 * swapin error entries can be found in the mapping. But they're
1429		 * deliberately ignored here as we've done everything we can do.
1430		 */
1431		if (swp_type(entry) != type)
1432			continue;
1433
1434		indices[folio_batch_count(fbatch)] = xas.xa_index;
1435		if (!folio_batch_add(fbatch, folio))
1436			break;
1437
1438		if (need_resched()) {
1439			xas_pause(&xas);
1440			cond_resched_rcu();
1441		}
1442	}
1443	rcu_read_unlock();
1444
1445	return folio_batch_count(fbatch);
1446}
1447
1448/*
1449 * Move the swapped pages for an inode to page cache. Returns the count
1450 * of pages swapped in, or the error in case of failure.
1451 */
1452static int shmem_unuse_swap_entries(struct inode *inode,
1453		struct folio_batch *fbatch, pgoff_t *indices)
1454{
1455	int i = 0;
1456	int ret = 0;
1457	int error = 0;
1458	struct address_space *mapping = inode->i_mapping;
1459
1460	for (i = 0; i < folio_batch_count(fbatch); i++) {
1461		struct folio *folio = fbatch->folios[i];
1462
1463		error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
1464					mapping_gfp_mask(mapping), NULL, NULL);
1465		if (error == 0) {
1466			folio_unlock(folio);
1467			folio_put(folio);
1468			ret++;
1469		}
1470		if (error == -ENOMEM)
1471			break;
1472		error = 0;
1473	}
1474	return error ? error : ret;
1475}
1476
1477/*
1478 * If swap found in inode, free it and move page from swapcache to filecache.
1479 */
1480static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1481{
1482	struct address_space *mapping = inode->i_mapping;
1483	pgoff_t start = 0;
1484	struct folio_batch fbatch;
1485	pgoff_t indices[PAGEVEC_SIZE];
1486	int ret = 0;
1487
1488	do {
1489		folio_batch_init(&fbatch);
1490		if (!shmem_find_swap_entries(mapping, start, &fbatch,
1491					     indices, type)) {
1492			ret = 0;
1493			break;
1494		}
1495
1496		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1497		if (ret < 0)
1498			break;
1499
1500		start = indices[folio_batch_count(&fbatch) - 1];
1501	} while (true);
1502
1503	return ret;
1504}
1505
1506/*
1507 * Read all the shared memory data that resides in the swap
1508 * device 'type' back into memory, so the swap device can be
1509 * unused.
 *
 * Walks shmem_swaplist, pruning inodes whose swapped count is zero and
 * calling shmem_unuse_inode() on the others with shmem_swaplist_lock dropped.
 *
 * Returns 0 when the swaplist is empty or every inode was handled without
 * error, otherwise the first error returned by shmem_unuse_inode().
1510 */
1511int shmem_unuse(unsigned int type)
1512{
1513	struct shmem_inode_info *info, *next;
1514	int error = 0;
1515
1516	if (list_empty(&shmem_swaplist))
1517		return 0;
1518
1519	spin_lock(&shmem_swaplist_lock);
1520start_over:
1521	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1522		if (!info->swapped) {
1523			list_del_init(&info->swaplist);
1524			continue;
1525		}
1526		/*
1527		 * Drop the swaplist lock while searching the inode for swap;
1528		 * but before doing so, make sure shmem_evict_inode() will not
1529		 * remove placeholder inode from swaplist, nor let it be freed
1530		 * (igrab() would protect from unlink, but not from unmount).
1531		 */
1532		atomic_inc(&info->stop_eviction);
1533		spin_unlock(&shmem_swaplist_lock);
1534
1535		error = shmem_unuse_inode(&info->vfs_inode, type);
1536		cond_resched();
1537
1538		spin_lock(&shmem_swaplist_lock);
		/* Last holder of stop_eviction wakes a waiting shmem_evict_inode() */
1539		if (atomic_dec_and_test(&info->stop_eviction))
1540			wake_up_var(&info->stop_eviction);
1541		if (error)
1542			break;
		/* info is no longer on the swaplist: restart the walk from the head */
1543		if (list_empty(&info->swaplist))
1544			goto start_over;
		/* The lock was dropped above, so recompute the next entry */
1545		next = list_next_entry(info, swaplist);
1546		if (!info->swapped)
1547			list_del_init(&info->swaplist);
1548	}
1549	spin_unlock(&shmem_swaplist_lock);
1550
1551	return error;
1552}
1553
1554/**
1555 * shmem_writeout - Write the folio to swap
1556 * @folio: The folio to write
1557 * @plug: swap plug
1558 * @folio_list: list to put back folios on split
1559 *
1560 * Move the folio from the page cache to the swap cache.
 *
 * Return: AOP_WRITEPAGE_ACTIVATE, with the folio re-dirtied and still locked,
 * when the folio is not written to swap (inode locked in memory, noswap
 * mount, no swap available, split or swap allocation failure, fallocate in
 * progress, or swap_writeout() itself asking for reactivation).  Otherwise
 * the value returned by swap_writeout(), in which case the folio has been
 * unlocked.
1561 */
1562int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
1563		struct list_head *folio_list)
1564{
1565	struct address_space *mapping = folio->mapping;
1566	struct inode *inode = mapping->host;
1567	struct shmem_inode_info *info = SHMEM_I(inode);
1568	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1569	pgoff_t index;
1570	int nr_pages;
1571	bool split = false;
1572
1573	if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap)
1574		goto redirty;
1575
1576	if (!total_swap_pages)
1577		goto redirty;
1578
1579	/*
1580	 * If CONFIG_THP_SWAP is not enabled, the large folio should be
1581	 * split when swapping.
1582	 *
1583	 * And shrinkage of pages beyond i_size does not split swap, so
1584	 * swapout of a large folio crossing i_size needs to split too
1585	 * (unless fallocate has been used to preallocate beyond EOF).
1586	 */
1587	if (folio_test_large(folio)) {
1588		index = shmem_fallocend(inode,
1589			DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
1590		if ((index > folio->index && index < folio_next_index(folio)) ||
1591		    !IS_ENABLED(CONFIG_THP_SWAP))
1592			split = true;
1593	}
1594
1595	if (split) {
1596try_split:
1597		/* Ensure the subpages are still dirty */
1598		folio_test_set_dirty(folio);
1599		if (split_folio_to_list(folio, folio_list))
1600			goto redirty;
1601		folio_clear_dirty(folio);
1602	}
1603
	/* Taken after any split, so these describe the folio actually written */
1604	index = folio->index;
1605	nr_pages = folio_nr_pages(folio);
1606
1607	/*
1608	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1609	 * value into swapfile.c, the only way we can correctly account for a
1610	 * fallocated folio arriving here is now to initialize it and write it.
1611	 *
1612	 * That's okay for a folio already fallocated earlier, but if we have
1613	 * not yet completed the fallocation, then (a) we want to keep track
1614	 * of this folio in case we have to undo it, and (b) it may not be a
1615	 * good idea to continue anyway, once we're pushing into swap.  So
1616	 * reactivate the folio, and let shmem_fallocate() quit when too many.
1617	 */
1618	if (!folio_test_uptodate(folio)) {
1619		if (inode->i_private) {
1620			struct shmem_falloc *shmem_falloc;
1621			spin_lock(&inode->i_lock);
1622			shmem_falloc = inode->i_private;
1623			if (shmem_falloc &&
1624			    !shmem_falloc->waitq &&
1625			    index >= shmem_falloc->start &&
1626			    index < shmem_falloc->next)
1627				shmem_falloc->nr_unswapped += nr_pages;
1628			else
1629				shmem_falloc = NULL;
1630			spin_unlock(&inode->i_lock);
1631			if (shmem_falloc)
1632				goto redirty;
1633		}
1634		folio_zero_range(folio, 0, folio_size(folio));
1635		flush_dcache_folio(folio);
1636		folio_mark_uptodate(folio);
1637	}
1638
1639	if (!folio_alloc_swap(folio)) {
1640		bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
1641		int error;
1642
1643		/*
1644		 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1645		 * if it's not already there.  Do it now before the folio is
1646		 * removed from page cache, when its pagelock no longer
1647		 * protects the inode from eviction.  And do it now, after
1648		 * we've incremented swapped, because shmem_unuse() will
1649		 * prune a !swapped inode from the swaplist.
1650		 */
1651		if (first_swapped) {
1652			spin_lock(&shmem_swaplist_lock);
1653			if (list_empty(&info->swaplist))
1654				list_add(&info->swaplist, &shmem_swaplist);
1655			spin_unlock(&shmem_swaplist_lock);
1656		}
1657
1658		swap_shmem_alloc(folio->swap, nr_pages);
1659		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
1660
1661		BUG_ON(folio_mapped(folio));
1662		error = swap_writeout(folio, plug);
1663		if (error != AOP_WRITEPAGE_ACTIVATE) {
1664			/* folio has been unlocked */
1665			return error;
1666		}
1667
1668		/*
1669		 * The intention here is to avoid holding on to the swap when
1670		 * zswap was unable to compress and unable to writeback; but
1671		 * it will be appropriate if other reactivate cases are added.
1672		 */
1673		error = shmem_add_to_page_cache(folio, mapping, index,
1674				swp_to_radix_entry(folio->swap),
1675				__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
1676		/* Swap entry might be erased by racing shmem_free_swap() */
1677		if (!error) {
1678			shmem_recalc_inode(inode, 0, -nr_pages);
1679			swap_free_nr(folio->swap, nr_pages);
1680		}
1681
1682		/*
1683		 * The swap_cache_del_folio() below could be left for
1684		 * shrink_folio_list()'s folio_free_swap() to dispose of;
1685		 * but I'm a little nervous about letting this folio out of
1686		 * shmem_writeout() in a hybrid half-tmpfs-half-swap state
1687		 * e.g. folio_mapping(folio) might give an unexpected answer.
1688		 */
1689		swap_cache_del_folio(folio);
1690		goto redirty;
1691	}
	/* Swap allocation failed for a multi-page folio: split it and try again */
1692	if (nr_pages > 1)
1693		goto try_split;
1694redirty:
1695	folio_mark_dirty(folio);
1696	return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
1697}
1698EXPORT_SYMBOL_GPL(shmem_writeout);
1699
1700#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1701static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1702{
1703	char buffer[64];
1704
1705	if (!mpol || mpol->mode == MPOL_DEFAULT)
1706		return;		/* show nothing */
1707
1708	mpol_to_str(buffer, sizeof(buffer), mpol);
1709
1710	seq_printf(seq, ",mpol=%s", buffer);
1711}
1712
1713static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1714{
1715	struct mempolicy *mpol = NULL;
1716	if (sbinfo->mpol) {
1717		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1718		mpol = sbinfo->mpol;
1719		mpol_get(mpol);
1720		raw_spin_unlock(&sbinfo->stat_lock);
1721	}
1722	return mpol;
1723}
1724#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub for builds without CONFIG_NUMA or CONFIG_TMPFS: prints nothing. */
1725static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1726{
1727}
/* Stub for builds without CONFIG_NUMA or CONFIG_TMPFS: always returns NULL. */
1728static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1729{
1730	return NULL;
1731}
1732#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1733
/*
 * Forward declaration: used by shmem_swapin_cluster() and shmem_alloc_folio()
 * below, ahead of its definition.
 */
1734static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
1735			pgoff_t index, unsigned int order, pgoff_t *ilx);
1736
1737static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1738			struct shmem_inode_info *info, pgoff_t index)
1739{
1740	struct mempolicy *mpol;
1741	pgoff_t ilx;
1742	struct folio *folio;
1743
1744	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
1745	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
1746	mpol_cond_put(mpol);
1747
1748	return folio;
1749}
1750
1751/*
1752 * Make sure huge_gfp is always more limited than limit_gfp.
1753 * Some of the flags set permissions, while others set limitations.
1754 */
1755static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1756{
1757	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1758	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1759	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1760	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1761
1762	/* Allow allocations only from the originally specified zones. */
1763	result |= zoneflags;
1764
1765	/*
1766	 * Minimize the result gfp by taking the union with the deny flags,
1767	 * and the intersection of the allow flags.
1768	 */
1769	result |= (limit_gfp & denyflags);
1770	result |= (huge_gfp & limit_gfp) & allowflags;
1771
1772	return result;
1773}
1774
1775#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1776bool shmem_hpage_pmd_enabled(void)
1777{
1778	if (shmem_huge == SHMEM_HUGE_DENY)
1779		return false;
1780	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
1781		return true;
1782	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
1783		return true;
1784	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
1785		return true;
1786	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
1787	    shmem_huge != SHMEM_HUGE_NEVER)
1788		return true;
1789
1790	return false;
1791}
1792
1793unsigned long shmem_allowable_huge_orders(struct inode *inode,
1794				struct vm_area_struct *vma, pgoff_t index,
1795				loff_t write_end, bool shmem_huge_force)
1796{
1797	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1798	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1799	vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
1800	unsigned int global_orders;
1801
1802	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
1803		return 0;
1804
1805	global_orders = shmem_huge_global_enabled(inode, index, write_end,
1806						  shmem_huge_force, vma, vm_flags);
1807	/* Tmpfs huge pages allocation */
1808	if (!vma || !vma_is_anon_shmem(vma))
1809		return global_orders;
1810
1811	/*
1812	 * Following the 'deny' semantics of the top level, force the huge
1813	 * option off from all mounts.
1814	 */
1815	if (shmem_huge == SHMEM_HUGE_DENY)
1816		return 0;
1817
1818	/*
1819	 * Only allow inherit orders if the top-level value is 'force', which
1820	 * means non-PMD sized THP can not override 'huge' mount option now.
1821	 */
1822	if (shmem_huge == SHMEM_HUGE_FORCE)
1823		return READ_ONCE(huge_shmem_orders_inherit);
1824
1825	/* Allow mTHP that will be fully within i_size. */
1826	mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
1827
1828	if (vm_flags & VM_HUGEPAGE)
1829		mask |= READ_ONCE(huge_shmem_orders_madvise);
1830
1831	if (global_orders > 0)
1832		mask |= READ_ONCE(huge_shmem_orders_inherit);
1833
1834	return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1835}
1836
1837static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1838					   struct address_space *mapping, pgoff_t index,
1839					   unsigned long orders)
1840{
1841	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1842	pgoff_t aligned_index;
1843	unsigned long pages;
1844	int order;
1845
1846	if (vma) {
1847		orders = thp_vma_suitable_orders(vma, vmf->address, orders);
1848		if (!orders)
1849			return 0;
1850	}
1851
1852	/* Find the highest order that can add into the page cache */
1853	order = highest_order(orders);
1854	while (orders) {
1855		pages = 1UL << order;
1856		aligned_index = round_down(index, pages);
1857		/*
1858		 * Check for conflict before waiting on a huge allocation.
1859		 * Conflict might be that a huge page has just been allocated
1860		 * and added to page cache by a racing thread, or that there
1861		 * is already at least one small page in the huge extent.
1862		 * Be careful to retry when appropriate, but not forever!
1863		 * Elsewhere -EEXIST would be the right code, but not here.
1864		 */
1865		if (!xa_find(&mapping->i_pages, &aligned_index,
1866			     aligned_index + pages - 1, XA_PRESENT))
1867			break;
1868		order = next_order(&orders, order);
1869	}
1870
1871	return orders;
1872}
1873#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE no order is suitable: always returns 0. */
1874static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1875					   struct address_space *mapping, pgoff_t index,
1876					   unsigned long orders)
1877{
1878	return 0;
1879}
1880#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1881
1882static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
1883		struct shmem_inode_info *info, pgoff_t index)
1884{
1885	struct mempolicy *mpol;
1886	pgoff_t ilx;
1887	struct folio *folio;
1888
1889	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
1890	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1891	mpol_cond_put(mpol);
1892
1893	return folio;
1894}
1895
/*
 * Allocate a new folio for @index of @inode, charge it, add it to the page
 * cache and account its blocks.
 *
 * When @orders is non-zero (and CONFIG_TRANSPARENT_HUGEPAGE is enabled) the
 * suitable large orders are tried from the highest down, and @index is
 * aligned down to the order that succeeded; otherwise a single order-0 folio
 * is allocated.
 *
 * Returns the new folio, locked and added to the LRU, or an ERR_PTR():
 * -ENOMEM when no folio could be allocated, -EEXIST when the memcg charge
 * failed and something is already present in the target extent, or the error
 * from mem_cgroup_charge(), shmem_add_to_page_cache() or
 * shmem_inode_acct_blocks().
 */
1896static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
1897		gfp_t gfp, struct inode *inode, pgoff_t index,
1898		struct mm_struct *fault_mm, unsigned long orders)
1899{
1900	struct address_space *mapping = inode->i_mapping;
1901	struct shmem_inode_info *info = SHMEM_I(inode);
1902	unsigned long suitable_orders = 0;
1903	struct folio *folio = NULL;
1904	pgoff_t aligned_index;
1905	long pages;
1906	int error, order;
1907
1908	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1909		orders = 0;
1910
1911	if (orders > 0) {
1912		suitable_orders = shmem_suitable_orders(inode, vmf,
1913							mapping, index, orders);
1914
		/* Try each suitable order in turn, highest first */
1915		order = highest_order(suitable_orders);
1916		while (suitable_orders) {
1917			pages = 1UL << order;
1918			aligned_index = round_down(index, pages);
1919			folio = shmem_alloc_folio(gfp, order, info, aligned_index);
1920			if (folio) {
1921				index = aligned_index;
1922				goto allocated;
1923			}
1924
1925			if (pages == HPAGE_PMD_NR)
1926				count_vm_event(THP_FILE_FALLBACK);
1927			count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
1928			order = next_order(&suitable_orders, order);
1929		}
1930	} else {
1931		pages = 1;
1932		folio = shmem_alloc_folio(gfp, 0, info, index);
1933	}
1934	if (!folio)
1935		return ERR_PTR(-ENOMEM);
1936
1937allocated:
1938	__folio_set_locked(folio);
1939	__folio_set_swapbacked(folio);
1940
1941	gfp &= GFP_RECLAIM_MASK;
1942	error = mem_cgroup_charge(folio, fault_mm, gfp);
1943	if (error) {
		/* Report -EEXIST if the extent is already populated, else count the fallback */
1944		if (xa_find(&mapping->i_pages, &index,
1945				index + pages - 1, XA_PRESENT)) {
1946			error = -EEXIST;
1947		} else if (pages > 1) {
1948			if (pages == HPAGE_PMD_NR) {
1949				count_vm_event(THP_FILE_FALLBACK);
1950				count_vm_event(THP_FILE_FALLBACK_CHARGE);
1951			}
1952			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
1953			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
1954		}
1955		goto unlock;
1956	}
1957
1958	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
1959	if (error)
1960		goto unlock;
1961
1962	error = shmem_inode_acct_blocks(inode, pages);
1963	if (error) {
1964		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1965		long freed;
1966		/*
1967		 * Try to reclaim some space by splitting a few
1968		 * large folios beyond i_size on the filesystem.
1969		 */
1970		shmem_unused_huge_shrink(sbinfo, NULL, pages);
1971		/*
1972		 * And do a shmem_recalc_inode() to account for freed pages:
1973		 * except our folio is there in cache, so not quite balanced.
1974		 */
1975		spin_lock(&info->lock);
1976		freed = pages + info->alloced - info->swapped -
1977			READ_ONCE(mapping->nrpages);
1978		if (freed > 0)
1979			info->alloced -= freed;
1980		spin_unlock(&info->lock);
1981		if (freed > 0)
1982			shmem_inode_unacct_blocks(inode, freed);
		/* Second and last attempt at accounting the blocks */
1983		error = shmem_inode_acct_blocks(inode, pages);
1984		if (error) {
1985			filemap_remove_folio(folio);
1986			goto unlock;
1987		}
1988	}
1989
1990	shmem_recalc_inode(inode, pages, 0);
1991	folio_add_lru(folio);
1992	return folio;
1993
1994unlock:
1995	folio_unlock(folio);
1996	folio_put(folio);
1997	return ERR_PTR(error);
1998}
1999
/*
 * Allocate a folio of the requested @order for @index and start reading it
 * in from swap entry @entry (the caller uses this for direct swapin that
 * skips the swap cache and readahead).
 *
 * The new folio is charged with mem_cgroup_swapin_charge_folio(), the swap
 * entries are claimed with swapcache_prepare(), and the folio is returned
 * locked, with ->swap set, after swap_read_folio() has been issued.  If any
 * step fails for a large order, the whole attempt is retried once as an
 * order-0 swapin of the sub-entry that backs @index.
 *
 * Returns the folio on success, or an ERR_PTR():
 *   -EINVAL  non-zero @order without CONFIG_TRANSPARENT_HUGEPAGE
 *   -ENOMEM  folio allocation or memcg charging failed
 *   -EEXIST  swapcache_prepare() failed for the order-0 attempt
 */
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
		struct vm_area_struct *vma, pgoff_t index,
		swp_entry_t entry, int order, gfp_t gfp)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	int nr_pages = 1 << order;
	struct folio *new;
	gfp_t alloc_gfp;
	void *shadow;

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success with further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	alloc_gfp = gfp;
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
		if (WARN_ON_ONCE(order))
			return ERR_PTR(-EINVAL);
	} else if (order) {
		/*
		 * If uffd is active for the vma, we need per-page fault
		 * fidelity to maintain the uffd semantics, then fallback
		 * to swapin order-0 folio, as well as for zswap case.
		 * Any existing sub folio in the swap cache also blocks
		 * mTHP swapin.
		 */
		if ((vma && unlikely(userfaultfd_armed(vma))) ||
		     !zswap_never_enabled() ||
		     non_swapcache_batch(entry, nr_pages) != nr_pages)
			goto fallback;

		/* Large allocation: combine the THP gfp policy with ours. */
		alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
	}
retry:
	new = shmem_alloc_folio(alloc_gfp, order, info, index);
	if (!new) {
		new = ERR_PTR(-ENOMEM);
		goto fallback;
	}

	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
					   alloc_gfp, entry)) {
		folio_put(new);
		new = ERR_PTR(-ENOMEM);
		goto fallback;
	}

	/*
	 * Prevent parallel swapin from proceeding with the swap cache flag.
	 *
	 * Of course there is another possible concurrent scenario as well,
	 * that is to say, the swap cache flag of a large folio has already
	 * been set by swapcache_prepare(), while another thread may have
	 * already split the large swap entry stored in the shmem mapping.
	 * In this case, shmem_add_to_page_cache() will help identify the
	 * concurrent swapin and return -EEXIST.
	 */
	if (swapcache_prepare(entry, nr_pages)) {
		folio_put(new);
		new = ERR_PTR(-EEXIST);
		/* Try smaller folio to avoid cache conflict */
		goto fallback;
	}

	/* The folio is not yet visible elsewhere: non-atomic flag setters. */
	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	new->swap = entry;

	memcg1_swapin(entry, nr_pages);
	shadow = swap_cache_get_shadow(entry);
	if (shadow)
		workingset_refault(new, shadow);
	folio_add_lru(new);
	swap_read_folio(new, NULL);
	return new;
fallback:
	/* Order 0 swapin failed, nothing to fallback to, abort */
	if (!order)
		return new;
	/*
	 * Large swapin failed or was not attempted: advance @entry to the
	 * sub-entry that corresponds to @index and retry with a single page
	 * and the caller's unmodified (constraint-stripped) gfp mask.
	 */
	entry.val += index - round_down(index, nr_pages);
	alloc_gfp = gfp;
	nr_pages = 1;
	order = 0;
	goto retry;
}
2086
2087/*
2088 * When a page is moved from swapcache to shmem filecache (either by the
2089 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
2090 * shmem_unuse_inode()), it may have been read in earlier from swap, in
2091 * ignorance of the mapping it belongs to.  If that mapping has special
2092 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
2093 * we may need to copy to a suitable page before moving to filecache.
2094 *
2095 * In a future release, this may well be extended to respect cpuset and
2096 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
2097 * but for now it is a simple matter of zone.
2098 */
2099static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
2100{
2101	return folio_zonenum(folio) > gfp_zone(gfp);
2102}
2103
/*
 * Replace *@foliop, a folio in the swap cache, with a newly allocated folio
 * of the same order obtained with @gfp (minus cpuset and node constraints).
 *
 * The contents of the old folio are copied over, the new folio takes the
 * old one's place in the swap cache under the swap cluster lock, and the
 * old folio is unlocked and its references are dropped.
 *
 * Returns 0 with *@foliop pointing at the new, locked folio, or -ENOMEM if
 * the new folio cannot be allocated (*@foliop is then left untouched).
 */
static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index,
				struct vm_area_struct *vma)
{
	struct swap_cluster_info *ci;
	struct folio *new, *old = *foliop;
	swp_entry_t entry = old->swap;
	int nr_pages = folio_nr_pages(old);
	int error = 0;

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (nr_pages > 1) {
		/* Large folio: combine the THP gfp policy with ours. */
		gfp_t huge_gfp = vma_thp_gfp_mask(vma);

		gfp = limit_gfp_mask(huge_gfp, gfp);
	}
#endif

	new = shmem_alloc_folio(gfp, folio_order(old), info, index);
	if (!new)
		return -ENOMEM;

	/*
	 * Take nr_pages extra references on the new folio; the old folio
	 * gives up the same number below once it has left the swap cache.
	 */
	folio_ref_add(new, nr_pages);
	folio_copy(new, old);
	flush_dcache_folio(new);

	/* Make the new folio look like the old one before publishing it. */
	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	folio_mark_uptodate(new);
	new->swap = entry;
	folio_set_swapcache(new);

	/* Swap the folios in the swap cache while holding the cluster lock. */
	ci = swap_cluster_get_and_lock_irq(old);
	__swap_cache_replace_folio(ci, old, new);
	mem_cgroup_replace_folio(old, new);
	shmem_update_stats(new, nr_pages);
	shmem_update_stats(old, -nr_pages);
	swap_cluster_unlock_irq(ci);

	folio_add_lru(new);
	*foliop = new;

	folio_clear_swapcache(old);
	old->private = NULL;

	folio_unlock(old);
	/*
	 * The old folio has been removed from the swap cache: drop its
	 * 'nr_pages' references, as well as the one temporary reference
	 * obtained from the swap cache lookup.
	 */
	folio_put_refs(old, nr_pages + 1);
	return error;
}
2163
2164static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
2165					 struct folio *folio, swp_entry_t swap,
2166					 bool skip_swapcache)
2167{
2168	struct address_space *mapping = inode->i_mapping;
2169	swp_entry_t swapin_error;
2170	void *old;
2171	int nr_pages;
2172
2173	swapin_error = make_poisoned_swp_entry();
2174	old = xa_cmpxchg_irq(&mapping->i_pages, index,
2175			     swp_to_radix_entry(swap),
2176			     swp_to_radix_entry(swapin_error), 0);
2177	if (old != swp_to_radix_entry(swap))
2178		return;
2179
2180	nr_pages = folio_nr_pages(folio);
2181	folio_wait_writeback(folio);
2182	if (!skip_swapcache)
2183		swap_cache_del_folio(folio);
2184	/*
2185	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
2186	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
2187	 * in shmem_evict_inode().
2188	 */
2189	shmem_recalc_inode(inode, -nr_pages, -nr_pages);
2190	swap_free_nr(swap, nr_pages);
2191}
2192
/*
 * Split the large swap entry that covers @index in @inode's mapping until
 * @index itself is covered by an order-0 entry.
 *
 * The slot must still hold @swap (the first swap entry of the large entry).
 * After every split step the resulting sub-entries are rewritten so that
 * each one carries the swap offset matching its position within the
 * original large entry.
 *
 * @gfp is reduced to its reclaim-related bits and used by xas_nomem() when
 * an attempt needs memory.
 *
 * Returns 0 on success (including when the entry is already order 0),
 * -EEXIST if the slot no longer holds @swap, or the error left in the
 * xa_state by the split.
 */
static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
				   swp_entry_t swap, gfp_t gfp)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
	int split_order = 0;
	int i;

	/* Convert user data gfp flags to xarray node gfp flags */
	gfp &= GFP_RECLAIM_MASK;

	/* Retried for as long as xas_nomem() asks for another attempt. */
	for (;;) {
		void *old = NULL;
		int cur_order;
		pgoff_t swap_index;

		xas_lock_irq(&xas);
		old = xas_load(&xas);
		/* Someone else changed the slot: let the caller retry. */
		if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}

		/* Already a single-page entry: nothing to split. */
		cur_order = xas_get_order(&xas);
		if (!cur_order)
			goto unlock;

		/* Try to split large swap entry in pagecache */
		swap_index = round_down(index, 1 << cur_order);
		split_order = xas_try_split_min_order(cur_order);

		/* Each pass lowers the order of the entry covering @index. */
		while (cur_order > 0) {
			pgoff_t aligned_index =
				round_down(index, 1 << cur_order);
			pgoff_t swap_offset = aligned_index - swap_index;

			xas_set_order(&xas, index, split_order);
			xas_try_split(&xas, old, cur_order);
			if (xas_error(&xas))
				goto unlock;

			/*
			 * Re-set the swap entry after splitting, and the swap
			 * offset of the original large entry must be continuous.
			 */
			for (i = 0; i < 1 << cur_order;
			     i += (1 << split_order)) {
				swp_entry_t tmp;

				tmp = swp_entry(swp_type(swap),
						swp_offset(swap) + swap_offset +
							i);
				__xa_store(&mapping->i_pages, aligned_index + i,
					   swp_to_radix_entry(tmp), 0);
			}
			cur_order = split_order;
			split_order = xas_try_split_min_order(split_order);
		}

unlock:
		xas_unlock_irq(&xas);

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		return xas_error(&xas);

	return 0;
}
2264
/*
 * Swap in the folio for @index, described by the swap entry in *foliop.
 *
 * Caller has to make sure that *foliop contains the xarray value entry
 * (i.e. the swap entry) found in the mapping at @index.
 *
 * @sgp:	SGP_WRITE additionally marks the swapped-in folio accessed.
 * @gfp:	allocation mask for new folios and page cache insertion.
 * @vma:	faulting vma, or NULL when not called from a fault.
 * @fault_type:	if non-NULL, VM_FAULT_MAJOR is or'ed in when the folio had
 *		to be read from swap rather than found in the swap cache.
 *
 * Returns 0 and the locked folio in *foliop on success.  On failure,
 * returns the error code and NULL in *foliop:
 *   -EIO     the entry is a poison marker, or the folio read from swap is
 *            not uptodate
 *   -EEXIST  the mapping no longer holds the expected swap entry (the
 *            caller retries its lookup)
 *   -EINVAL  the swap device could not be obtained although the entry is
 *            still present
 *   other    errors from allocation, splitting or page cache insertion
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			     struct folio **foliop, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
	struct shmem_inode_info *info = SHMEM_I(inode);
	swp_entry_t swap;
	softleaf_t index_entry;
	struct swap_info_struct *si;
	struct folio *folio = NULL;
	bool skip_swapcache = false;
	int error, nr_pages, order;
	pgoff_t offset;

	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
	index_entry = radix_to_swp_entry(*foliop);
	swap = index_entry;
	*foliop = NULL;

	if (softleaf_is_poison_marker(index_entry))
		return -EIO;

	/*
	 * Pin the swap device, then check that the mapping still holds the
	 * entry; a negative order means it does not.
	 */
	si = get_swap_device(index_entry);
	order = shmem_confirm_swap(mapping, index, index_entry);
	if (unlikely(!si)) {
		if (order < 0)
			return -EEXIST;
		else
			return -EINVAL;
	}
	if (unlikely(order < 0)) {
		put_swap_device(si);
		return -EEXIST;
	}

	/* index may point to the middle of a large entry, get the sub entry */
	if (order) {
		offset = index - round_down(index, 1 << order);
		swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
	}

	/* Look it up and read it in.. */
	folio = swap_cache_get_folio(swap);
	if (!folio) {
		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
			/* Direct swapin skipping swap cache & readahead */
			folio = shmem_swap_alloc_folio(inode, vma, index,
						       index_entry, order, gfp);
			if (IS_ERR(folio)) {
				error = PTR_ERR(folio);
				folio = NULL;
				goto failed;
			}
			skip_swapcache = true;
		} else {
			/* Cached swapin only supports order 0 folio */
			folio = shmem_swapin_cluster(swap, gfp, info, index);
			if (!folio) {
				error = -ENOMEM;
				goto failed;
			}
		}
		/* The folio had to be read in: account a major fault. */
		if (fault_type) {
			*fault_type |= VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(fault_mm, PGMAJFAULT);
		}
	} else {
		swap_update_readahead(folio, NULL, 0);
	}

	if (order > folio_order(folio)) {
		/*
		 * Swapin may get smaller folios due to various reasons:
		 * It may fallback to order 0 due to memory pressure or race,
		 * swap readahead may swap in order 0 folios into swapcache
		 * asynchronously, while the shmem mapping can still store
		 * large swap entries. In such cases, we should split the
		 * large swap entry to prevent possible data corruption.
		 */
		error = shmem_split_large_entry(inode, index, index_entry, gfp);
		if (error)
			goto failed_nolock;
	}

	/*
	 * If the folio is large, round down swap and index by folio size.
	 * No matter what race occurs, the swap layer ensures we either get
	 * a valid folio that has its swap entry aligned by size, or a
	 * temporarily invalid one which we'll abort very soon and retry.
	 *
	 * shmem_add_to_page_cache ensures the whole range contains expected
	 * entries and prevents any corruption, so any race split is fine
	 * too, it will succeed as long as the entries are still there.
	 */
	nr_pages = folio_nr_pages(folio);
	if (nr_pages > 1) {
		swap.val = round_down(swap.val, nr_pages);
		index = round_down(index, nr_pages);
	}

	/*
	 * We have to do this with the folio locked to prevent races.
	 * The shmem_confirm_swap below only checks if the first swap
	 * entry matches the folio, that's enough to ensure the folio
	 * is not used outside of shmem, as shmem swap entries
	 * and swap cache folios are never partially freed.
	 */
	folio_lock(folio);
	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
	    shmem_confirm_swap(mapping, index, swap) < 0 ||
	    folio->swap.val != swap.val) {
		error = -EEXIST;
		goto unlock;
	}
	if (!folio_test_uptodate(folio)) {
		error = -EIO;
		goto failed;
	}
	folio_wait_writeback(folio);

	/*
	 * Some architectures may have to restore extra metadata to the
	 * folio after reading from swap.
	 */
	arch_swap_restore(folio_swap(swap, folio), folio);

	if (shmem_should_replace_folio(folio, gfp)) {
		error = shmem_replace_folio(&folio, gfp, info, index, vma);
		if (error)
			goto failed;
	}

	/* Replace the swap entry in the mapping with the folio itself. */
	error = shmem_add_to_page_cache(folio, mapping, index,
					swp_to_radix_entry(swap), gfp);
	if (error)
		goto failed;

	shmem_recalc_inode(inode, 0, -nr_pages);

	if (sgp == SGP_WRITE)
		folio_mark_accessed(folio);

	/* Detach the folio from swap: it now lives in the page cache only. */
	if (skip_swapcache) {
		folio->swap.val = 0;
		swapcache_clear(si, swap, nr_pages);
	} else {
		swap_cache_del_folio(folio);
	}
	folio_mark_dirty(folio);
	swap_free_nr(swap, nr_pages);
	put_swap_device(si);

	*foliop = folio;
	return 0;
failed:
	/* A vanished entry takes precedence over whatever error we hit. */
	if (shmem_confirm_swap(mapping, index, swap) < 0)
		error = -EEXIST;
	if (error == -EIO)
		shmem_set_folio_swapin_error(inode, index, folio, swap,
					     skip_swapcache);
unlock:
	if (folio)
		folio_unlock(folio);
failed_nolock:
	/* skip_swapcache is only ever set once a folio has been allocated. */
	if (skip_swapcache)
		swapcache_clear(si, folio->swap, folio_nr_pages(folio));
	if (folio)
		folio_put(folio);
	put_swap_device(si);

	return error;
}
2446
/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
 *
 * @write_end is passed on to shmem_allowable_huge_orders() when choosing
 * the orders that may be tried for a new allocation.
 *
 * Returns 0 on success, with the folio (locked) in *foliop; *foliop is NULL
 * for a hole under SGP_READ.  0 is also returned, with *fault_type set from
 * handle_userfault(), when the fault is handed over to userfaultfd.
 * Otherwise returns a negative error:
 *   -EINVAL  not a shmem mapping, or index at/beyond i_size for
 *            sgp <= SGP_CACHE
 *   -EFBIG   index beyond MAX_LFS_FILESIZE
 *   -ENOENT  hole under SGP_NOALLOC
 *   other    errors from swapin or allocation
 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
		loff_t write_end, struct folio **foliop, enum sgp_type sgp,
		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{
	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
	struct mm_struct *fault_mm;
	struct folio *folio;
	int error;
	bool alloced;
	unsigned long orders = 0;

	if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
		return -EINVAL;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
repeat:
	/* Re-checked on every retry: i_size may have changed meanwhile. */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
		return -EINVAL;

	alloced = false;
	fault_mm = vma ? vma->vm_mm : NULL;

	/* May return a folio, a swap (value) entry, or NULL for a hole. */
	folio = filemap_get_entry(inode->i_mapping, index);
	if (folio && vma && userfaultfd_minor(vma)) {
		if (!xa_is_value(folio))
			folio_put(folio);
		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
		return 0;
	}

	if (xa_is_value(folio)) {
		error = shmem_swapin_folio(inode, index, &folio,
					   sgp, gfp, vma, fault_type);
		/* The entry changed under us: look it up again. */
		if (error == -EEXIST)
			goto repeat;

		*foliop = folio;
		return error;
	}

	if (folio) {
		folio_lock(folio);

		/* Has the folio been truncated or swapped out? */
		if (unlikely(folio->mapping != inode->i_mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		if (sgp == SGP_WRITE)
			folio_mark_accessed(folio);
		if (folio_test_uptodate(folio))
			goto out;
		/* fallocated folio */
		if (sgp != SGP_READ)
			goto clear;
		/* SGP_READ treats a not-uptodate folio like a hole. */
		folio_unlock(folio);
		folio_put(folio);
	}

	/*
	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
	 */
	*foliop = NULL;
	if (sgp == SGP_READ)
		return 0;
	if (sgp == SGP_NOALLOC)
		return -ENOENT;

	/*
	 * Fast cache lookup and swap lookup did not find it: allocate.
	 */

	if (vma && userfaultfd_missing(vma)) {
		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
		return 0;
	}

	/* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
	orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
	if (orders > 0) {
		gfp_t huge_gfp;

		huge_gfp = vma_thp_gfp_mask(vma);
		huge_gfp = limit_gfp_mask(huge_gfp, gfp);
		folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
				inode, index, fault_mm, orders);
		if (!IS_ERR(folio)) {
			if (folio_test_pmd_mappable(folio))
				count_vm_event(THP_FILE_ALLOC);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
			goto alloced;
		}
		if (PTR_ERR(folio) == -EEXIST)
			goto repeat;
		/* Any other failure: fall through to an order-0 attempt. */
	}

	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
	if (IS_ERR(folio)) {
		error = PTR_ERR(folio);
		if (error == -EEXIST)
			goto repeat;
		folio = NULL;
		goto unlock;
	}

alloced:
	alloced = true;
	if (folio_test_large(folio) &&
	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
					folio_next_index(folio)) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		struct shmem_inode_info *info = SHMEM_I(inode);
		/*
		 * Part of the large folio is beyond i_size: subject
		 * to shrink under memory pressure.
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		/*
		 * _careful to defend against unlocked access to
		 * ->shrink_list in shmem_unused_huge_shrink()
		 */
		if (list_empty_careful(&info->shrinklist)) {
			list_add_tail(&info->shrinklist,
				      &sbinfo->shrinklist);
			sbinfo->shrinklist_len++;
		}
		spin_unlock(&sbinfo->shrinklist_lock);
	}

	if (sgp == SGP_WRITE)
		folio_set_referenced(folio);
	/*
	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
	 */
	if (sgp == SGP_FALLOC)
		sgp = SGP_WRITE;
clear:
	/*
	 * Let SGP_WRITE caller clear ends if write does not fill folio;
	 * but SGP_FALLOC on a folio fallocated earlier must initialize
	 * it now, lest undo on failure cancel our earlier guarantee.
	 */
	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
		long i, n = folio_nr_pages(folio);

		for (i = 0; i < n; i++)
			clear_highpage(folio_page(folio, i));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		goto unlock;
	}
out:
	*foliop = folio;
	return 0;

	/*
	 * Error recovery.
	 */
unlock:
	/* A folio we inserted ourselves is taken back out of the page cache. */
	if (alloced)
		filemap_remove_folio(folio);
	shmem_recalc_inode(inode, 0, 0);
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	return error;
}
2634
2635/**
2636 * shmem_get_folio - find, and lock a shmem folio.
2637 * @inode:	inode to search
2638 * @index:	the page index.
2639 * @write_end:	end of a write, could extend inode size
2640 * @foliop:	pointer to the folio if found
2641 * @sgp:	SGP_* flags to control behavior
2642 *
2643 * Looks up the page cache entry at @inode & @index.  If a folio is
2644 * present, it is returned locked with an increased refcount.
2645 *
2646 * If the caller modifies data in the folio, it must call folio_mark_dirty()
2647 * before unlocking the folio to ensure that the folio is not reclaimed.
2648 * There is no need to reserve space before calling folio_mark_dirty().
2649 *
2650 * When no folio is found, the behavior depends on @sgp:
2651 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
2652 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2653 *  - for all other flags a new folio is allocated, inserted into the
2654 *    page cache and returned locked in @foliop.
2655 *
2656 * Context: May sleep.
2657 * Return: 0 if successful, else a negative error code.
2658 */
2659int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2660		    struct folio **foliop, enum sgp_type sgp)
2661{
2662	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2663			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
2664}
2665EXPORT_SYMBOL_GPL(shmem_get_folio);
2666
2667/*
2668 * This is like autoremove_wake_function, but it removes the wait queue
2669 * entry unconditionally - even if something else had already woken the
2670 * target.
2671 */
2672static int synchronous_wake_function(wait_queue_entry_t *wait,
2673			unsigned int mode, int sync, void *key)
2674{
2675	int ret = default_wake_function(wait, mode, sync, key);
2676	list_del_init(&wait->entry);
2677	return ret;
2678}
2679
/*
 * Trinity finds that probing a hole which tmpfs is punching can
 * prevent the hole-punch from ever completing: which in turn
 * locks writers out with its hold on i_rwsem.  So refrain from
 * faulting pages into the hole while it's being punched.  Although
 * shmem_undo_range() does remove the additions, it may be unable to
 * keep up, as each new page needs its own unmap_mapping_range() call,
 * and the i_mmap tree grows ever slower to scan if new vmas are added.
 *
 * It does not matter if we sometimes reach this check just before the
 * hole-punch begins, so that one fault then races with the punch:
 * we just need to make racing faults a rare case.
 *
 * The implementation below would be much simpler if we just used a
 * standard mutex or completion: but we cannot take i_rwsem in fault,
 * and bloating every shmem inode for this unlikely case would be sad.
 *
 * Returns 0 when the fault does not have to wait (the caller carries on),
 * VM_FAULT_NOPAGE after having waited for the hole-punch, or
 * VM_FAULT_RETRY when maybe_unlock_mmap_for_io() handed back a pinned
 * file, which is released here.
 */
static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{
	struct shmem_falloc *shmem_falloc;
	struct file *fpin = NULL;
	vm_fault_t ret = 0;

	/* i_private is examined under i_lock. */
	spin_lock(&inode->i_lock);
	shmem_falloc = inode->i_private;
	/* Wait only if the fault offset lies within [start, next). */
	if (shmem_falloc &&
	    shmem_falloc->waitq &&
	    vmf->pgoff >= shmem_falloc->start &&
	    vmf->pgoff < shmem_falloc->next) {
		wait_queue_head_t *shmem_falloc_waitq;
		DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

		ret = VM_FAULT_NOPAGE;
		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		/* Queue ourselves before dropping i_lock, then sleep. */
		shmem_falloc_waitq = shmem_falloc->waitq;
		prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
				TASK_UNINTERRUPTIBLE);
		spin_unlock(&inode->i_lock);
		schedule();

		/*
		 * shmem_falloc_waitq points into the shmem_fallocate()
		 * stack of the hole-punching task: shmem_falloc_waitq
		 * is usually invalid by the time we reach here, but
		 * finish_wait() does not dereference it in that case;
		 * though i_lock needed lest racing with wake_up_all().
		 */
		spin_lock(&inode->i_lock);
		finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
	}
	spin_unlock(&inode->i_lock);
	if (fpin) {
		fput(fpin);
		ret = VM_FAULT_RETRY;
	}
	return ret;
}
2737
2738static vm_fault_t shmem_fault(struct vm_fault *vmf)
2739{
2740	struct inode *inode = file_inode(vmf->vma->vm_file);
2741	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2742	struct folio *folio = NULL;
2743	vm_fault_t ret = 0;
2744	int err;
2745
2746	/*
2747	 * Trinity finds that probing a hole which tmpfs is punching can
2748	 * prevent the hole-punch from ever completing: noted in i_private.
2749	 */
2750	if (unlikely(inode->i_private)) {
2751		ret = shmem_falloc_wait(vmf, inode);
2752		if (ret)
2753			return ret;
2754	}
2755
2756	WARN_ON_ONCE(vmf->page != NULL);
2757	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
2758				  gfp, vmf, &ret);
2759	if (err)
2760		return vmf_error(err);
2761	if (folio) {
2762		vmf->page = folio_file_page(folio, vmf->pgoff);
2763		ret |= VM_FAULT_LOCKED;
2764	}
2765	return ret;
2766}
2767
2768unsigned long shmem_get_unmapped_area(struct file *file,
2769				      unsigned long uaddr, unsigned long len,
2770				      unsigned long pgoff, unsigned long flags)
2771{
2772	unsigned long addr;
2773	unsigned long offset;
2774	unsigned long inflated_len;
2775	unsigned long inflated_addr;
2776	unsigned long inflated_offset;
2777	unsigned long hpage_size;
2778
2779	if (len > TASK_SIZE)
2780		return -ENOMEM;
2781
2782	addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);
2783
2784	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2785		return addr;
2786	if (IS_ERR_VALUE(addr))
2787		return addr;
2788	if (addr & ~PAGE_MASK)
2789		return addr;
2790	if (addr > TASK_SIZE - len)
2791		return addr;
2792
2793	if (shmem_huge == SHMEM_HUGE_DENY)
2794		return addr;
2795	if (flags & MAP_FIXED)
2796		return addr;
2797	/*
2798	 * Our priority is to support MAP_SHARED mapped hugely;
2799	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2800	 * But if caller specified an address hint and we allocated area there
2801	 * successfully, respect that as before.
2802	 */
2803	if (uaddr == addr)
2804		return addr;
2805
2806	hpage_size = HPAGE_PMD_SIZE;
2807	if (shmem_huge != SHMEM_HUGE_FORCE) {
2808		struct super_block *sb;
2809		unsigned long __maybe_unused hpage_orders;
2810		int order = 0;
2811
2812		if (file) {
2813			VM_BUG_ON(file->f_op != &shmem_file_operations);
2814			sb = file_inode(file)->i_sb;
2815		} else {
2816			/*
2817			 * Called directly from mm/mmap.c, or drivers/char/mem.c
2818			 * for "/dev/zero", to create a shared anonymous object.
2819			 */
2820			if (IS_ERR(shm_mnt))
2821				return addr;
2822			sb = shm_mnt->mnt_sb;
2823
2824			/*
2825			 * Find the highest mTHP order used for anonymous shmem to
2826			 * provide a suitable alignment address.
2827			 */
2828#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2829			hpage_orders = READ_ONCE(huge_shmem_orders_always);
2830			hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
2831			hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
2832			if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
2833				hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
2834
2835			if (hpage_orders > 0) {
2836				order = highest_order(hpage_orders);
2837				hpage_size = PAGE_SIZE << order;
2838			}
2839#endif
2840		}
2841		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
2842			return addr;
2843	}
2844
2845	if (len < hpage_size)
2846		return addr;
2847
2848	offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
2849	if (offset && offset + len < 2 * hpage_size)
2850		return addr;
2851	if ((addr & (hpage_size - 1)) == offset)
2852		return addr;
2853
2854	inflated_len = len + hpage_size - PAGE_SIZE;
2855	if (inflated_len > TASK_SIZE)
2856		return addr;
2857	if (inflated_len < len)
2858		return addr;
2859
2860	inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags);
2861	if (IS_ERR_VALUE(inflated_addr))
2862		return addr;
2863	if (inflated_addr & ~PAGE_MASK)
2864		return addr;
2865
2866	inflated_offset = inflated_addr & (hpage_size - 1);
2867	inflated_addr += offset - inflated_offset;
2868	if (inflated_offset > offset)
2869		inflated_addr += hpage_size;
2870
2871	if (inflated_addr > TASK_SIZE - len)
2872		return addr;
2873	return inflated_addr;
2874}
2875
2876#ifdef CONFIG_NUMA
2877static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2878{
2879	struct inode *inode = file_inode(vma->vm_file);
2880	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2881}
2882
2883static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2884					  unsigned long addr, pgoff_t *ilx)
2885{
2886	struct inode *inode = file_inode(vma->vm_file);
2887	pgoff_t index;
2888
2889	/*
2890	 * Bias interleave by inode number to distribute better across nodes;
2891	 * but this interface is independent of which page order is used, so
2892	 * supplies only that bias, letting caller apply the offset (adjusted
2893	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2894	 */
2895	*ilx = inode->i_ino;
2896	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2897	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2898}
2899
2900static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2901			pgoff_t index, unsigned int order, pgoff_t *ilx)
2902{
2903	struct mempolicy *mpol;
2904
2905	/* Bias interleave by inode number to distribute better across nodes */
2906	*ilx = info->vfs_inode.i_ino + (index >> order);
2907
2908	mpol = mpol_shared_policy_lookup(&info->policy, index);
2909	return mpol ? mpol : get_task_policy(current);
2910}
2911#else
2912static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2913			pgoff_t index, unsigned int order, pgoff_t *ilx)
2914{
2915	*ilx = 0;
2916	return NULL;
2917}
2918#endif /* CONFIG_NUMA */
2919
2920int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2921{
2922	struct inode *inode = file_inode(file);
2923	struct shmem_inode_info *info = SHMEM_I(inode);
2924	int retval = -ENOMEM;
2925
2926	/*
2927	 * What serializes the accesses to info->flags?
2928	 * ipc_lock_object() when called from shmctl_do_lock(),
2929	 * no serialization needed when called from shm_destroy().
2930	 */
2931	if (lock && !(info->flags & SHMEM_F_LOCKED)) {
2932		if (!user_shm_lock(inode->i_size, ucounts))
2933			goto out_nomem;
2934		info->flags |= SHMEM_F_LOCKED;
2935		mapping_set_unevictable(file->f_mapping);
2936	}
2937	if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
2938		user_shm_unlock(inode->i_size, ucounts);
2939		info->flags &= ~SHMEM_F_LOCKED;
2940		mapping_clear_unevictable(file->f_mapping);
2941	}
2942	retval = 0;
2943
2944out_nomem:
2945	return retval;
2946}
2947
2948static int shmem_mmap_prepare(struct vm_area_desc *desc)
2949{
2950	struct file *file = desc->file;
2951	struct inode *inode = file_inode(file);
2952
2953	file_accessed(file);
2954	/* This is anonymous shared memory if it is unlinked at the time of mmap */
2955	if (inode->i_nlink)
2956		desc->vm_ops = &shmem_vm_ops;
2957	else
2958		desc->vm_ops = &shmem_anon_vm_ops;
2959	return 0;
2960}
2961
2962static int shmem_file_open(struct inode *inode, struct file *file)
2963{
2964	file->f_mode |= FMODE_CAN_ODIRECT;
2965	return generic_file_open(inode, file);
2966}
2967
2968#ifdef CONFIG_TMPFS_XATTR
2969static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2970
2971#if IS_ENABLED(CONFIG_UNICODE)
2972/*
2973 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
2974 *
2975 * The casefold file attribute needs some special checks. I can just be added to
2976 * an empty dir, and can't be removed from a non-empty dir.
2977 */
2978static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
2979				      struct dentry *dentry, unsigned int *i_flags)
2980{
2981	unsigned int old = inode->i_flags;
2982	struct super_block *sb = inode->i_sb;
2983
2984	if (fsflags & FS_CASEFOLD_FL) {
2985		if (!(old & S_CASEFOLD)) {
2986			if (!sb->s_encoding)
2987				return -EOPNOTSUPP;
2988
2989			if (!S_ISDIR(inode->i_mode))
2990				return -ENOTDIR;
2991
2992			if (dentry && !simple_empty(dentry))
2993				return -ENOTEMPTY;
2994		}
2995
2996		*i_flags = *i_flags | S_CASEFOLD;
2997	} else if (old & S_CASEFOLD) {
2998		if (dentry && !simple_empty(dentry))
2999			return -ENOTEMPTY;
3000	}
3001
3002	return 0;
3003}
3004#else
3005static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
3006				      struct dentry *dentry, unsigned int *i_flags)
3007{
3008	if (fsflags & FS_CASEFOLD_FL)
3009		return -EOPNOTSUPP;
3010
3011	return 0;
3012}
3013#endif
3014
3015/*
3016 * chattr's fsflags are unrelated to extended attributes,
3017 * but tmpfs has chosen to enable them under the same config option.
3018 */
3019static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3020{
3021	unsigned int i_flags = 0;
3022	int ret;
3023
3024	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
3025	if (ret)
3026		return ret;
3027
3028	if (fsflags & FS_NOATIME_FL)
3029		i_flags |= S_NOATIME;
3030	if (fsflags & FS_APPEND_FL)
3031		i_flags |= S_APPEND;
3032	if (fsflags & FS_IMMUTABLE_FL)
3033		i_flags |= S_IMMUTABLE;
3034	/*
3035	 * But FS_NODUMP_FL does not require any action in i_flags.
3036	 */
3037	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
3038
3039	return 0;
3040}
3041#else
/*
 * Stub for builds without CONFIG_TMPFS_XATTR: fsflags are not applied to
 * the inode, so this does nothing.
 */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
3045#define shmem_initxattrs NULL
3046#endif
3047
3048static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
3049{
3050	return &SHMEM_I(inode)->dir_offsets;
3051}
3052
/*
 * Allocate and initialise a new shmem inode on @sb.
 *
 * @idmap: passed through to inode_init_owner()
 * @sb:    superblock the inode is created on
 * @dir:   parent directory, or NULL; when set, its fsflags masked by
 *         SHMEM_FL_INHERITED are copied to the new inode
 * @mode:  file type and permission bits; the type selects the operations
 * @dev:   device number, only handed to init_special_inode()
 * @flags: VM_NORESERVE here sets SHMEM_F_NORESERVE in the inode info
 *
 * Returns the new inode, the error of shmem_reserve_inode() wrapped in
 * ERR_PTR(), or ERR_PTR(-ENOSPC) when new_inode() fails.
 */
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
					     struct super_block *sb,
					     struct inode *dir, umode_t mode,
					     dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;
	int err;

	err = shmem_reserve_inode(sb, &ino);
	if (err)
		return ERR_PTR(err);

	inode = new_inode(sb);
	if (!inode) {
		/* presumably gives back the reservation taken just above */
		shmem_free_inode(sb, 0);
		return ERR_PTR(-ENOSPC);
	}

	inode->i_ino = ino;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_blocks = 0;
	simple_inode_init_ts(inode);
	inode->i_generation = get_random_u32();
	info = SHMEM_I(inode);
	/*
	 * Zero the bytes of info that lie before the inode itself
	 * (assumes the VFS inode is placed after those fields - TODO confirm).
	 */
	memset(info, 0, (char *)inode - (char *)info);
	spin_lock_init(&info->lock);
	atomic_set(&info->stop_eviction, 0);
	info->seals = F_SEAL_SEAL;
	info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
	info->i_crtime = inode_get_mtime(inode);
	info->fsflags = (dir == NULL) ? 0 :
		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
	/* NULL dentry: the directory-emptiness check is skipped; result ignored */
	if (info->fsflags)
		shmem_set_inode_flags(inode, info->fsflags, NULL);
	INIT_LIST_HEAD(&info->shrinklist);
	INIT_LIST_HEAD(&info->swaplist);
	simple_xattrs_init(&info->xattrs);
	cache_no_acl(inode);
	if (sbinfo->noswap)
		mapping_set_unevictable(inode->i_mapping);

	/* Don't consider 'deny' for emergencies and 'force' for testing */
	if (sbinfo->huge)
		mapping_set_large_folios(inode->i_mapping);

	/* Install the operations that match the file type */
	switch (mode & S_IFMT) {
	default:
		inode->i_op = &shmem_special_inode_operations;
		init_special_inode(inode, mode, dev);
		break;
	case S_IFREG:
		inode->i_mapping->a_ops = &shmem_aops;
		inode->i_op = &shmem_inode_operations;
		inode->i_fop = &shmem_file_operations;
		mpol_shared_policy_init(&info->policy,
					 shmem_get_sbmpol(sbinfo));
		break;
	case S_IFDIR:
		inc_nlink(inode);
		/* Some things misbehave if size == 0 on a directory */
		inode->i_size = 2 * BOGO_DIRENT_SIZE;
		inode->i_op = &shmem_dir_inode_operations;
		inode->i_fop = &simple_offset_dir_operations;
		simple_offset_init(shmem_get_offset_ctx(inode));
		break;
	case S_IFLNK:
		/*
		 * Must not load anything in the rbtree,
		 * mpol_free_shared_policy will not be called.
		 */
		mpol_shared_policy_init(&info->policy, NULL);
		break;
	}

	lockdep_annotate_inode_mutex_key(inode);
	return inode;
}
3133
3134#ifdef CONFIG_TMPFS_QUOTA
3135static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3136				     struct super_block *sb, struct inode *dir,
3137				     umode_t mode, dev_t dev, unsigned long flags)
3138{
3139	int err;
3140	struct inode *inode;
3141
3142	inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3143	if (IS_ERR(inode))
3144		return inode;
3145
3146	err = dquot_initialize(inode);
3147	if (err)
3148		goto errout;
3149
3150	err = dquot_alloc_inode(inode);
3151	if (err) {
3152		dquot_drop(inode);
3153		goto errout;
3154	}
3155	return inode;
3156
3157errout:
3158	inode->i_flags |= S_NOQUOTA;
3159	iput(inode);
3160	return ERR_PTR(err);
3161}
3162#else
3163static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3164				     struct super_block *sb, struct inode *dir,
3165				     umode_t mode, dev_t dev, unsigned long flags)
3166{
3167	return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3168}
3169#endif /* CONFIG_TMPFS_QUOTA */
3170
3171#ifdef CONFIG_USERFAULTFD
/*
 * Userfaultfd fill of a single page in a shmem mapping.
 *
 * One block is accounted to the inode, then a folio is obtained: either
 * the one left in *foliop by an earlier attempt, or a new one that is
 * filled from @src_addr (MFILL_ATOMIC_COPY) or cleared (otherwise).  The
 * folio is charged, added to the page cache at the index of @dst_addr and
 * mapped through mfill_atomic_install_pte().
 *
 * Returns 0 on success.  Errors:
 *   -ENOMEM  block accounting or folio allocation failed
 *   -ENOENT  the copy with page faults disabled did not complete; the
 *            folio is handed back in *foliop for the caller to retry
 *   -EFAULT  the page offset is beyond i_size
 *   or whatever mem_cgroup_charge(), shmem_add_to_page_cache() or
 *   mfill_atomic_install_pte() returned.
 */
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
			   struct vm_area_struct *dst_vma,
			   unsigned long dst_addr,
			   unsigned long src_addr,
			   uffd_flags_t flags,
			   struct folio **foliop)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	void *page_kaddr;
	struct folio *folio;
	int ret;
	pgoff_t max_off;

	if (shmem_inode_acct_blocks(inode, 1)) {
		/*
		 * We may have got a page, returned -ENOENT triggering a retry,
		 * and now we find ourselves with -ENOMEM. Release the page, to
		 * avoid a BUG_ON in our caller.
		 */
		if (unlikely(*foliop)) {
			folio_put(*foliop);
			*foliop = NULL;
		}
		return -ENOMEM;
	}

	if (!*foliop) {
		/* First attempt: no folio carried over from a retry */
		ret = -ENOMEM;
		folio = shmem_alloc_folio(gfp, 0, info, pgoff);
		if (!folio)
			goto out_unacct_blocks;

		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
			page_kaddr = kmap_local_folio(folio, 0);
			/*
			 * The read mmap_lock is held here.  Despite the
			 * mmap_lock being read recursive a deadlock is still
			 * possible if a writer has taken a lock.  For example:
			 *
			 * process A thread 1 takes read lock on own mmap_lock
			 * process A thread 2 calls mmap, blocks taking write lock
			 * process B thread 1 takes page fault, read lock on own mmap lock
			 * process B thread 2 calls mmap, blocks taking write lock
			 * process A thread 1 blocks taking read lock on process B
			 * process B thread 1 blocks taking read lock on process A
			 *
			 * Disable page faults to prevent potential deadlock
			 * and retry the copy outside the mmap_lock.
			 */
			pagefault_disable();
			ret = copy_from_user(page_kaddr,
					     (const void __user *)src_addr,
					     PAGE_SIZE);
			pagefault_enable();
			kunmap_local(page_kaddr);

			/* fallback to copy_from_user outside mmap_lock */
			if (unlikely(ret)) {
				*foliop = folio;
				ret = -ENOENT;
				/* don't free the page */
				goto out_unacct_blocks;
			}

			flush_dcache_folio(folio);
		} else {		/* ZEROPAGE */
			clear_user_highpage(&folio->page, dst_addr);
		}
	} else {
		/* Retry: take over the folio the caller handed back */
		folio = *foliop;
		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
		*foliop = NULL;
	}

	VM_BUG_ON(folio_test_locked(folio));
	VM_BUG_ON(folio_test_swapbacked(folio));
	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);
	__folio_mark_uptodate(folio);

	/* Refuse to install a page at or beyond the end of the file */
	ret = -EFAULT;
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(pgoff >= max_off))
		goto out_release;

	ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
	if (ret)
		goto out_release;
	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
	if (ret)
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_delete_from_cache;

	shmem_recalc_inode(inode, 1, 0);
	folio_unlock(folio);
	return 0;
	/* Error unwinding: each label undoes one more step, newest first */
out_delete_from_cache:
	filemap_remove_folio(folio);
out_release:
	folio_unlock(folio);
	folio_put(folio);
out_unacct_blocks:
	shmem_inode_unacct_blocks(inode, 1);
	return ret;
}
3285#endif /* CONFIG_USERFAULTFD */
3286
3287#ifdef CONFIG_TMPFS
3288static const struct inode_operations shmem_symlink_inode_operations;
3289static const struct inode_operations shmem_short_symlink_operations;
3290
3291static int
3292shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
3293		  loff_t pos, unsigned len,
3294		  struct folio **foliop, void **fsdata)
3295{
3296	struct inode *inode = mapping->host;
3297	struct shmem_inode_info *info = SHMEM_I(inode);
3298	pgoff_t index = pos >> PAGE_SHIFT;
3299	struct folio *folio;
3300	int ret = 0;
3301
3302	/* i_rwsem is held by caller */
3303	if (unlikely(info->seals & (F_SEAL_GROW |
3304				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
3305		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
3306			return -EPERM;
3307		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3308			return -EPERM;
3309	}
3310
3311	if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
3312		     pos + len > inode->i_size))
3313		return -EPERM;
3314
3315	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3316	if (ret)
3317		return ret;
3318
3319	if (folio_contain_hwpoisoned_page(folio)) {
3320		folio_unlock(folio);
3321		folio_put(folio);
3322		return -EIO;
3323	}
3324
3325	*foliop = folio;
3326	return 0;
3327}
3328
3329static int
3330shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
3331		loff_t pos, unsigned len, unsigned copied,
3332		struct folio *folio, void *fsdata)
3333{
3334	struct inode *inode = mapping->host;
3335
3336	if (pos + copied > inode->i_size)
3337		i_size_write(inode, pos + copied);
3338
3339	if (!folio_test_uptodate(folio)) {
3340		if (copied < folio_size(folio)) {
3341			size_t from = offset_in_folio(folio, pos);
3342			folio_zero_segments(folio, 0, from,
3343					from + copied, folio_size(folio));
3344		}
3345		folio_mark_uptodate(folio);
3346	}
3347	folio_mark_dirty(folio);
3348	folio_unlock(folio);
3349	folio_put(folio);
3350
3351	return copied;
3352}
3353
/*
 * Read from a shmem file into the iterator @to, starting at iocb->ki_pos.
 *
 * Each pass of the loop looks up the folio at the current position with
 * SGP_READ and copies at most one folio (or one page, when a large folio
 * contains hwpoisoned pages) to @to.  When no folio is returned, zeroes
 * are produced instead.  iocb->ki_pos is advanced by what was copied.
 *
 * Returns the number of bytes copied, or, if nothing was copied, the
 * error that stopped the loop (0 at end of file; -EIO for a hwpoisoned
 * page; -EFAULT for a short copy; otherwise the shmem_get_folio() error,
 * with -EINVAL mapped to 0).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	int error = 0;
	ssize_t retval = 0;

	for (;;) {
		struct folio *folio = NULL;
		struct page *page = NULL;
		unsigned long nr, ret;
		loff_t end_offset, i_size = i_size_read(inode);
		bool fallback_page_copy = false;
		size_t fsize;

		if (unlikely(iocb->ki_pos >= i_size))
			break;

		index = iocb->ki_pos >> PAGE_SHIFT;
		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
		if (error) {
			/* -EINVAL ends the read without being reported */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (folio) {
			folio_unlock(folio);

			page = folio_file_page(folio, index);
			if (PageHWPoison(page)) {
				folio_put(folio);
				error = -EIO;
				break;
			}

			/* Poison elsewhere in a large folio: copy page by page */
			if (folio_test_large(folio) &&
			    folio_test_has_hwpoisoned(folio))
				fallback_page_copy = true;
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_rwsem protection against truncate
		 */
		i_size = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= i_size)) {
			if (folio)
				folio_put(folio);
			break;
		}
		end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
		if (folio && likely(!fallback_page_copy))
			fsize = folio_size(folio);
		else
			fsize = PAGE_SIZE;
		/* Position within the folio/page, and bytes to copy from it */
		offset = iocb->ki_pos & (fsize - 1);
		nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);

		if (folio) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping)) {
				if (likely(!fallback_page_copy))
					flush_dcache_folio(folio);
				else
					flush_dcache_page(page);
			}

			/*
			 * Mark the folio accessed if we read the beginning.
			 */
			if (!offset)
				folio_mark_accessed(folio);
			/*
			 * Ok, we have the page, and it's up-to-date, so
			 * now we can copy it to user space...
			 */
			if (likely(!fallback_page_copy))
				ret = copy_folio_to_iter(folio, offset, nr, to);
			else
				ret = copy_page_to_iter(page, offset, nr, to);
			folio_put(folio);
		} else if (user_backed_iter(to)) {
			/*
			 * Copy to user tends to be so well optimized, but
			 * clear_user() not so much, that it is noticeably
			 * faster to copy the zero page instead of clearing.
			 */
			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
		} else {
			/*
			 * But submitting the same page twice in a row to
			 * splice() - or others? - can result in confusion:
			 * so don't attempt that optimization on pipes etc.
			 */
			ret = iov_iter_zero(nr, to);
		}

		retval += ret;
		iocb->ki_pos += ret;

		if (!iov_iter_count(to))
			break;
		/* A short copy with room left in @to is reported as a fault */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	file_accessed(file);
	return retval ? retval : error;
}
3473
3474static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3475{
3476	struct file *file = iocb->ki_filp;
3477	struct inode *inode = file->f_mapping->host;
3478	ssize_t ret;
3479
3480	inode_lock(inode);
3481	ret = generic_write_checks(iocb, from);
3482	if (ret <= 0)
3483		goto unlock;
3484	ret = file_remove_privs(file);
3485	if (ret)
3486		goto unlock;
3487	ret = file_update_time(file);
3488	if (ret)
3489		goto unlock;
3490	ret = generic_perform_write(iocb, from);
3491unlock:
3492	inode_unlock(inode);
3493	return ret;
3494}
3495
/*
 * Pipe buffer operations used for the buffers that
 * splice_zeropage_into_pipe() fills with ZERO_PAGE(0).  None of the
 * callbacks touches the page.
 */

/* Getting a reference always succeeds; nothing is counted here. */
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	return true;
}

/* Releasing the buffer needs no work. */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
}

/* Stealing the page is always refused. */
static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return false;
}

static const struct pipe_buf_operations zero_pipe_buf_ops = {
	.release	= zero_pipe_buf_release,
	.try_steal	= zero_pipe_buf_try_steal,
	.get		= zero_pipe_buf_get,
};
3518
3519static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3520					loff_t fpos, size_t size)
3521{
3522	size_t offset = fpos & ~PAGE_MASK;
3523
3524	size = min_t(size_t, size, PAGE_SIZE - offset);
3525
3526	if (!pipe_is_full(pipe)) {
3527		struct pipe_buffer *buf = pipe_head_buf(pipe);
3528
3529		*buf = (struct pipe_buffer) {
3530			.ops	= &zero_pipe_buf_ops,
3531			.page	= ZERO_PAGE(0),
3532			.offset	= offset,
3533			.len	= size,
3534		};
3535		pipe->head++;
3536	}
3537
3538	return size;
3539}
3540
/*
 * Splice data from a shmem file into @pipe, starting at *ppos.
 *
 * @len is first limited to the room left in the pipe.  Each pass of the
 * loop looks up the folio at *ppos with SGP_READ and splices it into the
 * pipe; when no folio is returned a zero-page buffer is spliced instead.
 * *ppos is advanced by the amount spliced.
 *
 * Returns the number of bytes spliced, or, if nothing was spliced, the
 * error that stopped the loop (0 at end of file; -EIO for a hwpoisoned
 * page; otherwise the shmem_get_folio() error, with -EINVAL mapped to 0).
 */
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio = NULL;
	size_t total_spliced = 0, used, npages, n, part;
	loff_t isize;
	int error = 0;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_buf_usage(pipe);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);

	do {
		bool fallback_page_splice = false;
		struct page *page = NULL;
		pgoff_t index;
		size_t size;

		if (*ppos >= i_size_read(inode))
			break;

		index = *ppos >> PAGE_SHIFT;
		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
		if (error) {
			/* -EINVAL ends the splice without being reported */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (folio) {
			folio_unlock(folio);

			page = folio_file_page(folio, index);
			if (PageHWPoison(page)) {
				/* The folio is released after the loop */
				error = -EIO;
				break;
			}

			if (folio_test_large(folio) &&
			    folio_test_has_hwpoisoned(folio))
				fallback_page_splice = true;
		}

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(*ppos >= isize))
			break;
		/*
		 * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
		 * pages.
		 */
		size = len;
		if (unlikely(fallback_page_splice)) {
			size_t offset = *ppos & ~PAGE_MASK;

			size = umin(size, PAGE_SIZE - offset);
		}
		part = min_t(loff_t, isize - *ppos, size);

		if (folio) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping)) {
				if (likely(!fallback_page_splice))
					flush_dcache_folio(folio);
				else
					flush_dcache_page(page);
			}
			folio_mark_accessed(folio);
			/*
			 * Ok, we have the page, and it's up-to-date, so we can
			 * now splice it into the pipe.
			 */
			n = splice_folio_into_pipe(pipe, folio, *ppos, part);
			folio_put(folio);
			folio = NULL;
		} else {
			n = splice_zeropage_into_pipe(pipe, *ppos, part);
		}

		if (!n)
			break;
		len -= n;
		total_spliced += n;
		*ppos += n;
		in->f_ra.prev_pos = *ppos;
		if (pipe_is_full(pipe))
			break;

		cond_resched();
	} while (len);

	/* Drop a folio still held because the loop was left early */
	if (folio)
		folio_put(folio);

	file_accessed(in);
	return total_spliced ? total_spliced : error;
}
3652
3653static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
3654{
3655	struct address_space *mapping = file->f_mapping;
3656	struct inode *inode = mapping->host;
3657
3658	if (whence != SEEK_DATA && whence != SEEK_HOLE)
3659		return generic_file_llseek_size(file, offset, whence,
3660					MAX_LFS_FILESIZE, i_size_read(inode));
3661	if (offset < 0)
3662		return -ENXIO;
3663
3664	inode_lock(inode);
3665	/* We're holding i_rwsem so we can access i_size directly */
3666	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
3667	if (offset >= 0)
3668		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3669	inode_unlock(inode);
3670	return offset;
3671}
3672
/*
 * fallocate() for shmem files.  Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are accepted; any other mode bit gives -EOPNOTSUPP.
 *
 * With FALLOC_FL_PUNCH_HOLE the range is unmapped and truncated while
 * inode->i_private points at an on-stack shmem_falloc that carries a
 * waitqueue.  Otherwise folios are instantiated over the page range that
 * covers [offset, offset + len) with SGP_FALLOC; if that fails part way,
 * shmem_undo_range() is called on the part already covered.  Unless
 * FALLOC_FL_KEEP_SIZE is given, i_size is raised to offset + len.
 *
 * The whole operation runs under inode_lock().  Returns 0 or a negative
 * errno: -EPERM for a frozen mapping or a conflicting seal, -ENOSPC when
 * the range exceeds max_blocks, -EINTR on a fatal signal, -ENOMEM, or
 * whatever inode_newsize_ok() or shmem_get_folio() reported.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end, undo_fallocend;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (info->flags & SHMEM_F_MAPPING_FROZEN) {
		error = -EPERM;
		goto out;
	}

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Only the whole pages inside the range get unmapped */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_rwsem */
		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
			error = -EPERM;
			goto out;
		}

		/* Publish the hole being punched through i_private */
		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/* Withdraw the on-stack state and wake anyone on the waitqueue */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/* Page range [start, end) covering the byte range */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	/*
	 * info->fallocend is only relevant when huge pages might be
	 * involved: to prevent split_huge_page() freeing fallocated
	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
	 */
	undo_fallocend = info->fallocend;
	if (info->fallocend < end)
		info->fallocend = end;

	for (index = start; index < end; ) {
		struct folio *folio;

		/*
		 * Check for fatal signal so that we abort early in OOM
		 * situations. We don't want to abort in case of non-fatal
		 * signals as large fallocate can take noticeable time and
		 * e.g. periodic timers may result in fallocate constantly
		 * restarting.
		 */
		if (fatal_signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_get_folio(inode, index, offset + len,
						&folio, SGP_FALLOC);
		if (error) {
			info->fallocend = undo_fallocend;
			/* Remove the !uptodate folios we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		/*
		 * Here is a more important optimization than it appears:
		 * a second SGP_FALLOC on the same large folio will clear it,
		 * making it uptodate and un-undoable if we fail later.
		 */
		index = folio_next_index(folio);
		/* Beware 32-bit wraparound */
		if (!index)
			index--;

		/*
		 * Inform shmem_writeout() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		if (!folio_test_uptodate(folio))
			shmem_falloc.nr_falloced += index - shmem_falloc.next;
		shmem_falloc.next = index;

		/*
		 * If !uptodate, leave it that way so that freeable folios
		 * can be recognized if we need to rollback on error later.
		 * But mark it dirty so that memory pressure will swap rather
		 * than free the folios we are allocating (and SGP_CACHE folios
		 * might still be clean: we now need to mark those dirty too).
		 */
		folio_mark_dirty(folio);
		folio_unlock(folio);
		folio_put(folio);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
undone:
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	if (!error)
		file_modified(file);
	inode_unlock(inode);
	return error;
}
3834
3835static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3836{
3837	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3838
3839	buf->f_type = TMPFS_MAGIC;
3840	buf->f_bsize = PAGE_SIZE;
3841	buf->f_namelen = NAME_MAX;
3842	if (sbinfo->max_blocks) {
3843		buf->f_blocks = sbinfo->max_blocks;
3844		buf->f_bavail =
3845		buf->f_bfree  = sbinfo->max_blocks -
3846				percpu_counter_sum(&sbinfo->used_blocks);
3847	}
3848	if (sbinfo->max_inodes) {
3849		buf->f_files = sbinfo->max_inodes;
3850		buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3851	}
3852	/* else leave those fields 0 like simple_statfs */
3853
3854	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3855
3856	return 0;
3857}
3858
3859/*
3860 * File creation. Allocate an inode, and we're done..
3861 */
3862static int
3863shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3864	    struct dentry *dentry, umode_t mode, dev_t dev)
3865{
3866	struct inode *inode;
3867	int error;
3868
3869	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
3870		return -EINVAL;
3871
3872	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3873	if (IS_ERR(inode))
3874		return PTR_ERR(inode);
3875
3876	error = simple_acl_create(dir, inode);
3877	if (error)
3878		goto out_iput;
3879	error = security_inode_init_security(inode, dir, &dentry->d_name,
3880					     shmem_initxattrs, NULL);
3881	if (error && error != -EOPNOTSUPP)
3882		goto out_iput;
3883
3884	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3885	if (error)
3886		goto out_iput;
3887
3888	dir->i_size += BOGO_DIRENT_SIZE;
3889	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3890	inode_inc_iversion(dir);
3891
3892	d_make_persistent(dentry, inode);
3893	return error;
3894
3895out_iput:
3896	iput(inode);
3897	return error;
3898}
3899
3900static int
3901shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3902	      struct file *file, umode_t mode)
3903{
3904	struct inode *inode;
3905	int error;
3906
3907	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3908	if (IS_ERR(inode)) {
3909		error = PTR_ERR(inode);
3910		goto err_out;
3911	}
3912	error = security_inode_init_security(inode, dir, NULL,
3913					     shmem_initxattrs, NULL);
3914	if (error && error != -EOPNOTSUPP)
3915		goto out_iput;
3916	error = simple_acl_create(dir, inode);
3917	if (error)
3918		goto out_iput;
3919	d_tmpfile(file, inode);
3920
3921err_out:
3922	return finish_open_simple(file, error);
3923out_iput:
3924	iput(inode);
3925	return error;
3926}
3927
3928static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3929				  struct dentry *dentry, umode_t mode)
3930{
3931	int error;
3932
3933	error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3934	if (error)
3935		return ERR_PTR(error);
3936	inc_nlink(dir);
3937	return NULL;
3938}
3939
3940static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3941			struct dentry *dentry, umode_t mode, bool excl)
3942{
3943	return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3944}
3945
3946/*
3947 * Link a file..
3948 */
3949static int shmem_link(struct dentry *old_dentry, struct inode *dir,
3950		      struct dentry *dentry)
3951{
3952	struct inode *inode = d_inode(old_dentry);
3953	int ret;
3954
3955	/*
3956	 * No ordinary (disk based) filesystem counts links as inodes;
3957	 * but each new link needs a new dentry, pinning lowmem, and
3958	 * tmpfs dentries cannot be pruned until they are unlinked.
3959	 * But if an O_TMPFILE file is linked into the tmpfs, the
3960	 * first link must skip that, to get the accounting right.
3961	 */
3962	if (inode->i_nlink) {
3963		ret = shmem_reserve_inode(inode->i_sb, NULL);
3964		if (ret)
3965			return ret;
3966	}
3967
3968	ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3969	if (ret) {
3970		if (inode->i_nlink)
3971			shmem_free_inode(inode->i_sb, 0);
3972		return ret;
3973	}
3974
3975	dir->i_size += BOGO_DIRENT_SIZE;
3976	inode_inc_iversion(dir);
3977	return simple_link(old_dentry, dir, dentry);
3978}
3979
3980static int shmem_unlink(struct inode *dir, struct dentry *dentry)
3981{
3982	struct inode *inode = d_inode(dentry);
3983
3984	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
3985		shmem_free_inode(inode->i_sb, 0);
3986
3987	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
3988
3989	dir->i_size -= BOGO_DIRENT_SIZE;
3990	inode_inc_iversion(dir);
3991	simple_unlink(dir, dentry);
3992
3993	/*
3994	 * For now, VFS can't deal with case-insensitive negative dentries, so
3995	 * we invalidate them
3996	 */
3997	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
3998		d_invalidate(dentry);
3999
4000	return 0;
4001}
4002
4003static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
4004{
4005	if (!simple_empty(dentry))
4006		return -ENOTEMPTY;
4007
4008	drop_nlink(d_inode(dentry));
4009	drop_nlink(dir);
4010	return shmem_unlink(dir, dentry);
4011}
4012
4013static int shmem_whiteout(struct mnt_idmap *idmap,
4014			  struct inode *old_dir, struct dentry *old_dentry)
4015{
4016	struct dentry *whiteout;
4017	int error;
4018
4019	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
4020	if (!whiteout)
4021		return -ENOMEM;
4022	error = shmem_mknod(idmap, old_dir, whiteout,
4023			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4024	dput(whiteout);
4025	return error;
4026}
4027
/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly frees it when it
 * gets overwritten.
 *
 * Accepted flags are RENAME_NOREPLACE, RENAME_EXCHANGE and
 * RENAME_WHITEOUT; anything else gives -EINVAL.  RENAME_EXCHANGE is
 * handed to simple_offset_rename_exchange() as is.
 *
 * Returns 0 on success, -ENOTEMPTY if the target is a non-empty
 * directory, or the error of simple_offset_add() or shmem_whiteout().
 */
static int shmem_rename2(struct mnt_idmap *idmap,
			 struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	struct inode *inode = d_inode(old_dentry);
	int they_are_dirs = S_ISDIR(inode->i_mode);
	bool had_offset = false;
	int error;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return simple_offset_rename_exchange(old_dir, old_dentry,
						     new_dir, new_dentry);

	if (!simple_empty(new_dentry))
		return -ENOTEMPTY;

	/*
	 * -EBUSY is not treated as a failure here: it is only remembered,
	 * so that the error path below leaves the offset in place.
	 */
	error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry);
	if (error == -EBUSY)
		had_offset = true;
	else if (unlikely(error))
		return error;

	if (flags & RENAME_WHITEOUT) {
		error = shmem_whiteout(idmap, old_dir, old_dentry);
		if (error) {
			/* Only take back an offset that was added just above */
			if (!had_offset)
				simple_offset_remove(shmem_get_offset_ctx(new_dir),
						     new_dentry);
			return error;
		}
	}

	simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (d_really_is_positive(new_dentry)) {
		/* An existing target is being replaced: unlink it first */
		(void) shmem_unlink(new_dir, new_dentry);
		if (they_are_dirs) {
			drop_nlink(d_inode(new_dentry));
			drop_nlink(old_dir);
		}
	} else if (they_are_dirs) {
		/* A directory moves: its link goes from old_dir to new_dir */
		drop_nlink(old_dir);
		inc_nlink(new_dir);
	}

	old_dir->i_size -= BOGO_DIRENT_SIZE;
	new_dir->i_size += BOGO_DIRENT_SIZE;
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	return 0;
}
4089
4090static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
4091			 struct dentry *dentry, const char *symname)
4092{
4093	int error;
4094	int len;
4095	struct inode *inode;
4096	struct folio *folio;
4097	char *link;
4098
4099	len = strlen(symname) + 1;
4100	if (len > PAGE_SIZE)
4101		return -ENAMETOOLONG;
4102
4103	inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
4104				VM_NORESERVE);
4105	if (IS_ERR(inode))
4106		return PTR_ERR(inode);
4107
4108	error = security_inode_init_security(inode, dir, &dentry->d_name,
4109					     shmem_initxattrs, NULL);
4110	if (error && error != -EOPNOTSUPP)
4111		goto out_iput;
4112
4113	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
4114	if (error)
4115		goto out_iput;
4116
4117	inode->i_size = len-1;
4118	if (len <= SHORT_SYMLINK_LEN) {
4119		link = kmemdup(symname, len, GFP_KERNEL);
4120		if (!link) {
4121			error = -ENOMEM;
4122			goto out_remove_offset;
4123		}
4124		inode->i_op = &shmem_short_symlink_operations;
4125		inode_set_cached_link(inode, link, len - 1);
4126	} else {
4127		inode_nohighmem(inode);
4128		inode->i_mapping->a_ops = &shmem_aops;
4129		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
4130		if (error)
4131			goto out_remove_offset;
4132		inode->i_op = &shmem_symlink_inode_operations;
4133		memcpy(folio_address(folio), symname, len);
4134		folio_mark_uptodate(folio);
4135		folio_mark_dirty(folio);
4136		folio_unlock(folio);
4137		folio_put(folio);
4138	}
4139	dir->i_size += BOGO_DIRENT_SIZE;
4140	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
4141	inode_inc_iversion(dir);
4142	d_make_persistent(dentry, inode);
4143	return 0;
4144
4145out_remove_offset:
4146	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4147out_iput:
4148	iput(inode);
4149	return error;
4150}
4151
/*
 * Delayed-call callback installed by shmem_get_link(): marks the folio that
 * held the link target as accessed and drops the reference on it.
 */
static void shmem_put_link(void *arg)
{
	struct folio *folio = arg;

	folio_mark_accessed(folio);
	folio_put(folio);
}
4157
4158static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
4159				  struct delayed_call *done)
4160{
4161	struct folio *folio = NULL;
4162	int error;
4163
4164	if (!dentry) {
4165		folio = filemap_get_folio(inode->i_mapping, 0);
4166		if (IS_ERR(folio))
4167			return ERR_PTR(-ECHILD);
4168		if (PageHWPoison(folio_page(folio, 0)) ||
4169		    !folio_test_uptodate(folio)) {
4170			folio_put(folio);
4171			return ERR_PTR(-ECHILD);
4172		}
4173	} else {
4174		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
4175		if (error)
4176			return ERR_PTR(error);
4177		if (!folio)
4178			return ERR_PTR(-ECHILD);
4179		if (PageHWPoison(folio_page(folio, 0))) {
4180			folio_unlock(folio);
4181			folio_put(folio);
4182			return ERR_PTR(-ECHILD);
4183		}
4184		folio_unlock(folio);
4185	}
4186	set_delayed_call(done, shmem_put_link, folio);
4187	return folio_address(folio);
4188}
4189
4190#ifdef CONFIG_TMPFS_XATTR
4191
4192static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
4193{
4194	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4195
4196	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
4197
4198	return 0;
4199}
4200
4201static int shmem_fileattr_set(struct mnt_idmap *idmap,
4202			      struct dentry *dentry, struct file_kattr *fa)
4203{
4204	struct inode *inode = d_inode(dentry);
4205	struct shmem_inode_info *info = SHMEM_I(inode);
4206	int ret, flags;
4207
4208	if (fileattr_has_fsx(fa))
4209		return -EOPNOTSUPP;
4210	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
4211		return -EOPNOTSUPP;
4212
4213	flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
4214		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
4215
4216	ret = shmem_set_inode_flags(inode, flags, dentry);
4217
4218	if (ret)
4219		return ret;
4220
4221	info->fsflags = flags;
4222
4223	inode_set_ctime_current(inode);
4224	inode_inc_iversion(inode);
4225	return 0;
4226}
4227
4228/*
4229 * Superblocks without xattr inode operations may get some security.* xattr
4230 * support from the LSM "for free". As soon as we have any other xattrs
4231 * like ACLs, we also need to implement the security.* handlers at
4232 * filesystem level, though.
4233 */
4234
4235/*
4236 * Callback for security_inode_init_security() for acquiring xattrs.
4237 */
4238static int shmem_initxattrs(struct inode *inode,
4239			    const struct xattr *xattr_array, void *fs_info)
4240{
4241	struct shmem_inode_info *info = SHMEM_I(inode);
4242	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4243	const struct xattr *xattr;
4244	struct simple_xattr *new_xattr;
4245	size_t ispace = 0;
4246	size_t len;
4247
4248	if (sbinfo->max_inodes) {
4249		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4250			ispace += simple_xattr_space(xattr->name,
4251				xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
4252		}
4253		if (ispace) {
4254			raw_spin_lock(&sbinfo->stat_lock);
4255			if (sbinfo->free_ispace < ispace)
4256				ispace = 0;
4257			else
4258				sbinfo->free_ispace -= ispace;
4259			raw_spin_unlock(&sbinfo->stat_lock);
4260			if (!ispace)
4261				return -ENOSPC;
4262		}
4263	}
4264
4265	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4266		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
4267		if (!new_xattr)
4268			break;
4269
4270		len = strlen(xattr->name) + 1;
4271		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
4272					  GFP_KERNEL_ACCOUNT);
4273		if (!new_xattr->name) {
4274			kvfree(new_xattr);
4275			break;
4276		}
4277
4278		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4279		       XATTR_SECURITY_PREFIX_LEN);
4280		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4281		       xattr->name, len);
4282
4283		simple_xattr_add(&info->xattrs, new_xattr);
4284	}
4285
4286	if (xattr->name != NULL) {
4287		if (ispace) {
4288			raw_spin_lock(&sbinfo->stat_lock);
4289			sbinfo->free_ispace += ispace;
4290			raw_spin_unlock(&sbinfo->stat_lock);
4291		}
4292		simple_xattrs_free(&info->xattrs, NULL);
4293		return -ENOMEM;
4294	}
4295
4296	return 0;
4297}
4298
4299static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4300				   struct dentry *unused, struct inode *inode,
4301				   const char *name, void *buffer, size_t size)
4302{
4303	struct shmem_inode_info *info = SHMEM_I(inode);
4304
4305	name = xattr_full_name(handler, name);
4306	return simple_xattr_get(&info->xattrs, name, buffer, size);
4307}
4308
4309static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4310				   struct mnt_idmap *idmap,
4311				   struct dentry *unused, struct inode *inode,
4312				   const char *name, const void *value,
4313				   size_t size, int flags)
4314{
4315	struct shmem_inode_info *info = SHMEM_I(inode);
4316	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4317	struct simple_xattr *old_xattr;
4318	size_t ispace = 0;
4319
4320	name = xattr_full_name(handler, name);
4321	if (value && sbinfo->max_inodes) {
4322		ispace = simple_xattr_space(name, size);
4323		raw_spin_lock(&sbinfo->stat_lock);
4324		if (sbinfo->free_ispace < ispace)
4325			ispace = 0;
4326		else
4327			sbinfo->free_ispace -= ispace;
4328		raw_spin_unlock(&sbinfo->stat_lock);
4329		if (!ispace)
4330			return -ENOSPC;
4331	}
4332
4333	old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
4334	if (!IS_ERR(old_xattr)) {
4335		ispace = 0;
4336		if (old_xattr && sbinfo->max_inodes)
4337			ispace = simple_xattr_space(old_xattr->name,
4338						    old_xattr->size);
4339		simple_xattr_free(old_xattr);
4340		old_xattr = NULL;
4341		inode_set_ctime_current(inode);
4342		inode_inc_iversion(inode);
4343	}
4344	if (ispace) {
4345		raw_spin_lock(&sbinfo->stat_lock);
4346		sbinfo->free_ispace += ispace;
4347		raw_spin_unlock(&sbinfo->stat_lock);
4348	}
4349	return PTR_ERR(old_xattr);
4350}
4351
4352static const struct xattr_handler shmem_security_xattr_handler = {
4353	.prefix = XATTR_SECURITY_PREFIX,
4354	.get = shmem_xattr_handler_get,
4355	.set = shmem_xattr_handler_set,
4356};
4357
4358static const struct xattr_handler shmem_trusted_xattr_handler = {
4359	.prefix = XATTR_TRUSTED_PREFIX,
4360	.get = shmem_xattr_handler_get,
4361	.set = shmem_xattr_handler_set,
4362};
4363
4364static const struct xattr_handler shmem_user_xattr_handler = {
4365	.prefix = XATTR_USER_PREFIX,
4366	.get = shmem_xattr_handler_get,
4367	.set = shmem_xattr_handler_set,
4368};
4369
/*
 * NULL-terminated list of the xattr handlers above; installed as sb->s_xattr
 * by shmem_fill_super() when CONFIG_TMPFS_XATTR is set.
 */
static const struct xattr_handler * const shmem_xattr_handlers[] = {
	&shmem_security_xattr_handler,
	&shmem_trusted_xattr_handler,
	&shmem_user_xattr_handler,
	NULL
};
4376
4377static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
4378{
4379	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4380	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
4381}
4382#endif /* CONFIG_TMPFS_XATTR */
4383
/*
 * Inode operations for short symlinks, whose target shmem_symlink() caches
 * on the inode via inode_set_cached_link(); ->get_link is therefore the
 * generic simple_get_link.
 */
static const struct inode_operations shmem_short_symlink_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
	.get_link	= simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
4392
/*
 * Inode operations for long symlinks, whose target shmem_symlink() stores in
 * folio 0 of the mapping; ->get_link reads it back with shmem_get_link().
 */
static const struct inode_operations shmem_symlink_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
	.get_link	= shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
4401
/* export_operations ->get_parent: always fails with -ESTALE. */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	return ERR_PTR(-ESTALE);
}
4406
4407static int shmem_match(struct inode *ino, void *vfh)
4408{
4409	__u32 *fh = vfh;
4410	__u64 inum = fh[2];
4411	inum = (inum << 32) | fh[1];
4412	return ino->i_ino == inum && fh[0] == ino->i_generation;
4413}
4414
/*
 * Find a dentry alias of @inode: the result of d_find_alias() if there is
 * one (preferring a hashed alias), otherwise whatever d_find_any_alias()
 * returns.
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);

	if (alias)
		return alias;
	return d_find_any_alias(inode);
}
4422
4423static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
4424		struct fid *fid, int fh_len, int fh_type)
4425{
4426	struct inode *inode;
4427	struct dentry *dentry = NULL;
4428	u64 inum;
4429
4430	if (fh_len < 3)
4431		return NULL;
4432
4433	inum = fid->raw[2];
4434	inum = (inum << 32) | fid->raw[1];
4435
4436	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
4437			shmem_match, fid->raw);
4438	if (inode) {
4439		dentry = shmem_find_alias(inode);
4440		iput(inode);
4441	}
4442
4443	return dentry;
4444}
4445
4446static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
4447				struct inode *parent)
4448{
4449	if (*len < 3) {
4450		*len = 3;
4451		return FILEID_INVALID;
4452	}
4453
4454	if (inode_unhashed(inode)) {
4455		/* Unfortunately insert_inode_hash is not idempotent,
4456		 * so as we hash inodes here rather than at creation
4457		 * time, we need a lock to ensure we only try
4458		 * to do it once
4459		 */
4460		static DEFINE_SPINLOCK(lock);
4461		spin_lock(&lock);
4462		if (inode_unhashed(inode))
4463			__insert_inode_hash(inode,
4464					    inode->i_ino + inode->i_generation);
4465		spin_unlock(&lock);
4466	}
4467
4468	fh[0] = inode->i_generation;
4469	fh[1] = inode->i_ino;
4470	fh[2] = ((__u64)inode->i_ino) >> 32;
4471
4472	*len = 3;
4473	return 1;
4474}
4475
/* Export operations; installed as sb->s_export_op by shmem_fill_super(). */
static const struct export_operations shmem_export_ops = {
	.get_parent     = shmem_get_parent,
	.encode_fh      = shmem_encode_fh,
	.fh_to_dentry	= shmem_fh_to_dentry,
};
4481
/*
 * Identifiers for the mount options listed in shmem_fs_parameters[] and
 * dispatched on in shmem_parse_one().
 */
enum shmem_param {
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	Opt_inode32,
	Opt_inode64,
	Opt_noswap,
	Opt_quota,
	Opt_usrquota,
	Opt_grpquota,
	Opt_usrquota_block_hardlimit,
	Opt_usrquota_inode_hardlimit,
	Opt_grpquota_block_hardlimit,
	Opt_grpquota_inode_hardlimit,
	Opt_casefold_version,	/* "casefold" given with a string value */
	Opt_casefold,		/* "casefold" given as a bare flag */
	Opt_strict_encoding,
};
4505
/* Accepted values of the "huge=" mount option and their SHMEM_HUGE_* codes. */
static const struct constant_table shmem_param_enums_huge[] = {
	{"never",	SHMEM_HUGE_NEVER },
	{"always",	SHMEM_HUGE_ALWAYS },
	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
	{"advise",	SHMEM_HUGE_ADVISE },
	{}
};
4513
/*
 * Mount option table handed to fs_parse() by shmem_parse_one().
 *
 * The quota options are only present with CONFIG_TMPFS_QUOTA.  "casefold"
 * is listed twice on purpose: once taking a string value
 * (Opt_casefold_version) and once as a bare flag (Opt_casefold).
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
	fsparam_gid   ("gid",		Opt_gid),
	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("mpol",		Opt_mpol),
	fsparam_string("nr_blocks",	Opt_nr_blocks),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("size",		Opt_size),
	fsparam_uid   ("uid",		Opt_uid),
	fsparam_flag  ("inode32",	Opt_inode32),
	fsparam_flag  ("inode64",	Opt_inode64),
	fsparam_flag  ("noswap",	Opt_noswap),
#ifdef CONFIG_TMPFS_QUOTA
	fsparam_flag  ("quota",		Opt_quota),
	fsparam_flag  ("usrquota",	Opt_usrquota),
	fsparam_flag  ("grpquota",	Opt_grpquota),
	fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
	fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
	fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
	fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
	fsparam_string("casefold",	Opt_casefold_version),
	fsparam_flag  ("casefold",	Opt_casefold),
	fsparam_flag  ("strict_encoding", Opt_strict_encoding),
	{}
};
4540
4541#if IS_ENABLED(CONFIG_UNICODE)
4542static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4543				    bool latest_version)
4544{
4545	struct shmem_options *ctx = fc->fs_private;
4546	int version = UTF8_LATEST;
4547	struct unicode_map *encoding;
4548	char *version_str = param->string + 5;
4549
4550	if (!latest_version) {
4551		if (strncmp(param->string, "utf8-", 5))
4552			return invalfc(fc, "Only UTF-8 encodings are supported "
4553				       "in the format: utf8-<version number>");
4554
4555		version = utf8_parse_version(version_str);
4556		if (version < 0)
4557			return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
4558	}
4559
4560	encoding = utf8_load(version);
4561
4562	if (IS_ERR(encoding)) {
4563		return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
4564			       unicode_major(version), unicode_minor(version),
4565			       unicode_rev(version));
4566	}
4567
4568	pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
4569		unicode_major(version), unicode_minor(version), unicode_rev(version));
4570
4571	ctx->encoding = encoding;
4572
4573	return 0;
4574}
4575#else
/*
 * Variant used when CONFIG_UNICODE is not enabled: the "casefold" option is
 * always rejected through invalfc().
 */
static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
				    bool latest_version)
{
	return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
}
4581#endif
4582
/*
 * fs_context ->parse_param: parse a single mount option into the
 * shmem_options held in fc->fs_private.
 *
 * The option is identified with fs_parse() against shmem_fs_parameters[];
 * a negative result from fs_parse() is returned unchanged.  Options that
 * affect limits or modes also set the matching SHMEM_SEEN_* bit in
 * ctx->seen so that later stages can tell "explicitly given" from "default".
 *
 * Returns 0 on success, or the result of invalfc() for an unsupported
 * option or a bad value.
 */
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
	struct shmem_options *ctx = fc->fs_private;
	struct fs_parse_result result;
	unsigned long long size;
	char *rest;
	int opt;
	kuid_t kuid;
	kgid_t kgid;

	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_size:
		size = memparse(param->string, &rest);
		/* A trailing '%' makes the value a percentage of total RAM. */
		if (*rest == '%') {
			size <<= PAGE_SHIFT;
			size *= totalram_pages();
			do_div(size, 100);
			rest++;
		}
		if (*rest)
			goto bad_value;
		/* Stored as a page count, rounded up. */
		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
		ctx->seen |= SHMEM_SEEN_BLOCKS;
		break;
	case Opt_nr_blocks:
		ctx->blocks = memparse(param->string, &rest);
		if (*rest || ctx->blocks > LONG_MAX)
			goto bad_value;
		ctx->seen |= SHMEM_SEEN_BLOCKS;
		break;
	case Opt_nr_inodes:
		ctx->inodes = memparse(param->string, &rest);
		/* inodes * BOGO_INODE_SIZE must fit in an unsigned long. */
		if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
			goto bad_value;
		ctx->seen |= SHMEM_SEEN_INODES;
		break;
	case Opt_mode:
		ctx->mode = result.uint_32 & 07777;
		break;
	case Opt_uid:
		kuid = result.uid;

		/*
		 * The requested uid must be representable in the
		 * filesystem's idmapping.
		 */
		if (!kuid_has_mapping(fc->user_ns, kuid))
			goto bad_value;

		ctx->uid = kuid;
		break;
	case Opt_gid:
		kgid = result.gid;

		/*
		 * The requested gid must be representable in the
		 * filesystem's idmapping.
		 */
		if (!kgid_has_mapping(fc->user_ns, kgid))
			goto bad_value;

		ctx->gid = kgid;
		break;
	case Opt_huge:
		ctx->huge = result.uint_32;
		/* Anything but "never" needs transparent hugepage support. */
		if (ctx->huge != SHMEM_HUGE_NEVER &&
		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
		      has_transparent_hugepage()))
			goto unsupported_parameter;
		ctx->seen |= SHMEM_SEEN_HUGE;
		break;
	case Opt_mpol:
		if (IS_ENABLED(CONFIG_NUMA)) {
			/* Drop any policy from an earlier mpol= first. */
			mpol_put(ctx->mpol);
			ctx->mpol = NULL;
			if (mpol_parse_str(param->string, &ctx->mpol))
				goto bad_value;
			break;
		}
		goto unsupported_parameter;
	case Opt_inode32:
		ctx->full_inums = false;
		ctx->seen |= SHMEM_SEEN_INUMS;
		break;
	case Opt_inode64:
		if (sizeof(ino_t) < 8) {
			return invalfc(fc,
				       "Cannot use inode64 with <64bit inums in kernel\n");
		}
		ctx->full_inums = true;
		ctx->seen |= SHMEM_SEEN_INUMS;
		break;
	case Opt_noswap:
		/* Only for CAP_SYS_ADMIN in the initial user namespace. */
		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
			return invalfc(fc,
				       "Turning off swap in unprivileged tmpfs mounts unsupported");
		}
		ctx->noswap = true;
		break;
	case Opt_quota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
		break;
	case Opt_usrquota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= QTYPE_MASK_USR;
		break;
	case Opt_grpquota:
		if (fc->user_ns != &init_user_ns)
			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
		ctx->seen |= SHMEM_SEEN_QUOTA;
		ctx->quota_types |= QTYPE_MASK_GRP;
		break;
	/* The four hard limits below must be non-zero and within range. */
	case Opt_usrquota_block_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
			return invalfc(fc,
				       "User quota block hardlimit too large.");
		ctx->qlimits.usrquota_bhardlimit = size;
		break;
	case Opt_grpquota_block_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
			return invalfc(fc,
				       "Group quota block hardlimit too large.");
		ctx->qlimits.grpquota_bhardlimit = size;
		break;
	case Opt_usrquota_inode_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
			return invalfc(fc,
				       "User quota inode hardlimit too large.");
		ctx->qlimits.usrquota_ihardlimit = size;
		break;
	case Opt_grpquota_inode_hardlimit:
		size = memparse(param->string, &rest);
		if (*rest || !size)
			goto bad_value;
		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
			return invalfc(fc,
				       "Group quota inode hardlimit too large.");
		ctx->qlimits.grpquota_ihardlimit = size;
		break;
	case Opt_casefold_version:
		/* "casefold=<string>": explicit version. */
		return shmem_parse_opt_casefold(fc, param, false);
	case Opt_casefold:
		/* Bare "casefold": latest version. */
		return shmem_parse_opt_casefold(fc, param, true);
	case Opt_strict_encoding:
#if IS_ENABLED(CONFIG_UNICODE)
		ctx->strict_encoding = true;
		break;
#else
		return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
#endif
	}
	return 0;

unsupported_parameter:
	return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
	return invalfc(fc, "Bad value for '%s'", param->key);
}
4759
/*
 * Split the next option off the comma-separated list at *s.
 *
 * A comma followed by a digit is treated as part of the current option
 * (mpol's nodelist may contain commas), so the option is only terminated at
 * a comma whose next character is not a digit.  That comma is overwritten
 * with NUL and *s is advanced past it; when no such comma remains, *s is set
 * to NULL.  Returns the start of the option, or NULL if *s was already NULL.
 */
static char *shmem_next_opt(char **s)
{
	char *start = *s;
	char *comma;

	if (!start)
		return NULL;

	while ((comma = strchr(*s, ',')) != NULL) {
		*s = comma + 1;
		if (isdigit(comma[1]))
			continue;	/* still inside a nodelist */
		*comma = '\0';
		return start;
	}

	*s = NULL;
	return start;
}
4787
/*
 * fs_context ->parse_monolithic: parse the option string via
 * vfs_parse_monolithic_sep(), using shmem_next_opt() as the splitter.
 */
static int shmem_parse_monolithic(struct fs_context *fc, void *data)
{
	return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
4792
/*
 * Reconfigure a shmem filesystem.
 *
 * Checks the options collected in fc->fs_private against the live
 * superblock and applies them only if every check passes.  All checks and
 * updates are done under sbinfo->stat_lock; every rejection happens before
 * the first write to sbinfo.
 *
 * Returns 0 on success, or the result of invalfc() carrying the message
 * selected below.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
	unsigned long used_isp;
	struct mempolicy *mpol = NULL;
	const char *err;

	raw_spin_lock(&sbinfo->stat_lock);
	/* Inode space in use right now: configured total minus what is free. */
	used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;

	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
		if (!sbinfo->max_blocks) {
			err = "Cannot retroactively limit size";
			goto out;
		}
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   ctx->blocks) > 0) {
			err = "Too small a size for current use";
			goto out;
		}
	}
	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
		if (!sbinfo->max_inodes) {
			err = "Cannot retroactively limit inodes";
			goto out;
		}
		if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
			err = "Too few inodes for current use";
			goto out;
		}
	}

	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
	    sbinfo->next_ino > UINT_MAX) {
		err = "Current inum too high to switch to 32-bit inums";
		goto out;
	}

	/*
	 * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
	 * counterpart for (re-)enabling swap.
	 */
	if (ctx->noswap && !sbinfo->noswap) {
		err = "Cannot disable swap on remount";
		goto out;
	}

	if (ctx->seen & SHMEM_SEEN_QUOTA &&
	    !sb_any_quota_loaded(fc->root->d_sb)) {
		err = "Cannot enable quota on remount";
		goto out;
	}

#ifdef CONFIG_TMPFS_QUOTA
/* True when a non-zero limit was requested that differs from the current one. */
#define CHANGED_LIMIT(name)						\
	(ctx->qlimits.name## hardlimit &&				\
	(ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))

	if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
	    CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
		err = "Cannot change global quota limit on remount";
		goto out;
	}
#endif /* CONFIG_TMPFS_QUOTA */

	/* All checks passed: apply only the options that were given. */
	if (ctx->seen & SHMEM_SEEN_HUGE)
		sbinfo->huge = ctx->huge;
	if (ctx->seen & SHMEM_SEEN_INUMS)
		sbinfo->full_inums = ctx->full_inums;
	if (ctx->seen & SHMEM_SEEN_BLOCKS)
		sbinfo->max_blocks  = ctx->blocks;
	if (ctx->seen & SHMEM_SEEN_INODES) {
		sbinfo->max_inodes  = ctx->inodes;
		/* Recompute free space so that current usage is preserved. */
		sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
	}

	/*
	 * Preserve previous mempolicy unless mpol remount option was specified.
	 */
	if (ctx->mpol) {
		mpol = sbinfo->mpol;
		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
		ctx->mpol = NULL;
	}

	if (ctx->noswap)
		sbinfo->noswap = true;

	raw_spin_unlock(&sbinfo->stat_lock);
	/* Drop the replaced policy (NULL if none was replaced) after unlocking. */
	mpol_put(mpol);
	return 0;
out:
	raw_spin_unlock(&sbinfo->stat_lock);
	return invalfc(fc, "%s", err);
}
4892
4893static int shmem_show_options(struct seq_file *seq, struct dentry *root)
4894{
4895	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
4896	struct mempolicy *mpol;
4897
4898	if (sbinfo->max_blocks != shmem_default_max_blocks())
4899		seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
4900	if (sbinfo->max_inodes != shmem_default_max_inodes())
4901		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
4902	if (sbinfo->mode != (0777 | S_ISVTX))
4903		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
4904	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
4905		seq_printf(seq, ",uid=%u",
4906				from_kuid_munged(&init_user_ns, sbinfo->uid));
4907	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
4908		seq_printf(seq, ",gid=%u",
4909				from_kgid_munged(&init_user_ns, sbinfo->gid));
4910
4911	/*
4912	 * Showing inode{64,32} might be useful even if it's the system default,
4913	 * since then people don't have to resort to checking both here and
4914	 * /proc/config.gz to confirm 64-bit inums were successfully applied
4915	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
4916	 *
4917	 * We hide it when inode64 isn't the default and we are using 32-bit
4918	 * inodes, since that probably just means the feature isn't even under
4919	 * consideration.
4920	 *
4921	 * As such:
4922	 *
4923	 *                     +-----------------+-----------------+
4924	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
4925	 *  +------------------+-----------------+-----------------+
4926	 *  | full_inums=true  | show            | show            |
4927	 *  | full_inums=false | show            | hide            |
4928	 *  +------------------+-----------------+-----------------+
4929	 *
4930	 */
4931	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
4932		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
4933#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4934	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
4935	if (sbinfo->huge)
4936		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
4937#endif
4938	mpol = shmem_get_sbmpol(sbinfo);
4939	shmem_show_mpol(seq, mpol);
4940	mpol_put(mpol);
4941	if (sbinfo->noswap)
4942		seq_printf(seq, ",noswap");
4943#ifdef CONFIG_TMPFS_QUOTA
4944	if (sb_has_quota_active(root->d_sb, USRQUOTA))
4945		seq_printf(seq, ",usrquota");
4946	if (sb_has_quota_active(root->d_sb, GRPQUOTA))
4947		seq_printf(seq, ",grpquota");
4948	if (sbinfo->qlimits.usrquota_bhardlimit)
4949		seq_printf(seq, ",usrquota_block_hardlimit=%lld",
4950			   sbinfo->qlimits.usrquota_bhardlimit);
4951	if (sbinfo->qlimits.grpquota_bhardlimit)
4952		seq_printf(seq, ",grpquota_block_hardlimit=%lld",
4953			   sbinfo->qlimits.grpquota_bhardlimit);
4954	if (sbinfo->qlimits.usrquota_ihardlimit)
4955		seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
4956			   sbinfo->qlimits.usrquota_ihardlimit);
4957	if (sbinfo->qlimits.grpquota_ihardlimit)
4958		seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
4959			   sbinfo->qlimits.grpquota_ihardlimit);
4960#endif
4961	return 0;
4962}
4963
4964#endif /* CONFIG_TMPFS */
4965
/*
 * Release everything hanging off a shmem superblock: the UTF-8 table (if
 * one was loaded), quotas, the per-cpu inode batch, the used-blocks counter,
 * the mempolicy reference and finally the shmem_sb_info itself.
 *
 * Also used as the failure path of shmem_fill_super().
 */
static void shmem_put_super(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

#if IS_ENABLED(CONFIG_UNICODE)
	if (sb->s_encoding)
		utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_TMPFS_QUOTA
	shmem_disable_quotas(sb);
#endif
	free_percpu(sbinfo->ino_batch);
	percpu_counter_destroy(&sbinfo->used_blocks);
	mpol_put(sbinfo->mpol);
	kfree(sbinfo);
	/* Do not leave a dangling pointer behind in the superblock. */
	sb->s_fs_info = NULL;
}
4984
4985#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
/*
 * Dentry operations using the generic case-insensitive hash and compare;
 * set as the default by shmem_fill_super() when an encoding was requested.
 */
static const struct dentry_operations shmem_ci_dentry_ops = {
	.d_hash = generic_ci_d_hash,
	.d_compare = generic_ci_d_compare,
};
4990#endif
4991
/*
 * Set up a new shmem superblock from the options in fc->fs_private.
 *
 * Allocates the shmem_sb_info, applies defaults for options that were not
 * given (user mounts only), copies limits and ownership from the context,
 * optionally enables quotas, and creates the root directory.
 *
 * Returns 0 on success.  On failure everything set up so far is released
 * through shmem_put_super() and a negative errno is returned; failure paths
 * that do not set @error explicitly report its initial value, -ENOMEM.
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct inode *inode;
	struct shmem_sb_info *sbinfo;
	int error = -ENOMEM;

	/* Round up to L1_CACHE_BYTES to resist false sharing */
	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				L1_CACHE_BYTES), GFP_KERNEL);
	if (!sbinfo)
		return error;

	sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance, limiting inodes to one per page of lowmem;
	 * but the internal instance is left unlimited.
	 */
	if (!(sb->s_flags & SB_KERNMOUNT)) {
		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
			ctx->blocks = shmem_default_max_blocks();
		if (!(ctx->seen & SHMEM_SEEN_INODES))
			ctx->inodes = shmem_default_max_inodes();
		if (!(ctx->seen & SHMEM_SEEN_INUMS))
			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
		sbinfo->noswap = ctx->noswap;
	} else {
		sb->s_flags |= SB_NOUSER;
	}
	sb->s_export_op = &shmem_export_ops;
	sb->s_flags |= SB_NOSEC;

#if IS_ENABLED(CONFIG_UNICODE)
	if (!ctx->encoding && ctx->strict_encoding) {
		pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
		error = -EINVAL;
		goto failed;
	}

	if (ctx->encoding) {
		/* From here on shmem_put_super() unloads the table. */
		sb->s_encoding = ctx->encoding;
		set_default_d_op(sb, &shmem_ci_dentry_ops);
		if (ctx->strict_encoding)
			sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
	}
#endif

#else
	sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
	sb->s_d_flags |= DCACHE_DONTCACHE;
	sbinfo->max_blocks = ctx->blocks;
	sbinfo->max_inodes = ctx->inodes;
	sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
	/* The per-cpu inode number batch is only set up for kernel mounts. */
	if (sb->s_flags & SB_KERNMOUNT) {
		sbinfo->ino_batch = alloc_percpu(ino_t);
		if (!sbinfo->ino_batch)
			goto failed;
	}
	sbinfo->uid = ctx->uid;
	sbinfo->gid = ctx->gid;
	sbinfo->full_inums = ctx->full_inums;
	sbinfo->mode = ctx->mode;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (ctx->seen & SHMEM_SEEN_HUGE)
		sbinfo->huge = ctx->huge;
	else
		sbinfo->huge = tmpfs_huge;
#endif
	/* The context's mempolicy reference moves to the superblock. */
	sbinfo->mpol = ctx->mpol;
	ctx->mpol = NULL;

	raw_spin_lock_init(&sbinfo->stat_lock);
	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
		goto failed;
	spin_lock_init(&sbinfo->shrinklist_lock);
	INIT_LIST_HEAD(&sbinfo->shrinklist);

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	sb->s_op = &shmem_ops;
	sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
	sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	sb->s_flags |= SB_POSIXACL;
#endif
	/* Each superblock gets a freshly generated UUID. */
	uuid_t uuid;
	uuid_gen(&uuid);
	super_set_uuid(sb, uuid.b, sizeof(uuid));

#ifdef CONFIG_TMPFS_QUOTA
	if (ctx->seen & SHMEM_SEEN_QUOTA) {
		sb->dq_op = &shmem_quota_operations;
		sb->s_qcop = &dquot_quotactl_sysfile_ops;
		sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;

		/* Copy the default limits from ctx into sbinfo */
		memcpy(&sbinfo->qlimits, &ctx->qlimits,
		       sizeof(struct shmem_quota_limits));

		if (shmem_enable_quotas(sb, ctx->quota_types))
			goto failed;
	}
#endif /* CONFIG_TMPFS_QUOTA */

	/* Create the root directory, owned by the configured uid/gid. */
	inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
				S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		goto failed;
	}
	inode->i_uid = sbinfo->uid;
	inode->i_gid = sbinfo->gid;
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto failed;
	return 0;

failed:
	shmem_put_super(sb);
	return error;
}
5121
/*
 * fs_context ->get_tree: obtain the superblock through get_tree_nodev(),
 * with shmem_fill_super() as the fill callback.
 */
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}
5126
5127static void shmem_free_fc(struct fs_context *fc)
5128{
5129	struct shmem_options *ctx = fc->fs_private;
5130
5131	if (ctx) {
5132		mpol_put(ctx->mpol);
5133		kfree(ctx);
5134	}
5135}
5136
/*
 * Filesystem context operations.  Option parsing and reconfiguration are
 * only wired up when CONFIG_TMPFS is enabled.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_monolithic,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};
5146
/* Slab cache for struct shmem_inode_info; created by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep __ro_after_init;
5148
5149static struct inode *shmem_alloc_inode(struct super_block *sb)
5150{
5151	struct shmem_inode_info *info;
5152	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
5153	if (!info)
5154		return NULL;
5155	return &info->vfs_inode;
5156}
5157
5158static void shmem_free_in_core_inode(struct inode *inode)
5159{
5160	if (S_ISLNK(inode->i_mode))
5161		kfree(inode->i_link);
5162	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
5163}
5164
5165static void shmem_destroy_inode(struct inode *inode)
5166{
5167	if (S_ISREG(inode->i_mode))
5168		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
5169	if (S_ISDIR(inode->i_mode))
5170		simple_offset_destroy(shmem_get_offset_ctx(inode));
5171}
5172
5173static void shmem_init_inode(void *foo)
5174{
5175	struct shmem_inode_info *info = foo;
5176	inode_init_once(&info->vfs_inode);
5177}
5178
/*
 * Create the slab cache for struct shmem_inode_info, with shmem_init_inode()
 * as the object constructor.  SLAB_PANIC is requested and the return value
 * is not checked here.
 */
static void __init shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
5185
/* Destroy the inode slab cache; used on the shmem_init() failure path. */
static void __init shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
5190
/*
 * address_space ->error_remove_folio hook.
 *
 * Keep the folio in the page cache instead of truncating it: this
 * implementation does nothing and simply reports success.
 */
static int shmem_error_remove_folio(struct address_space *mapping,
				   struct folio *folio)
{
	return 0;
}
5197
/*
 * Address-space operations for shmem.  write_begin/write_end are only
 * provided with CONFIG_TMPFS, folio migration only with CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
	.error_remove_folio = shmem_error_remove_folio,
};
5209
/*
 * File operations for shmem regular files.  mmap, open and unmapped-area
 * lookup are always available; seek, read/write, fsync, splice and fallocate
 * are only wired up with CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
	.mmap_prepare	= shmem_mmap_prepare,
	.open		= shmem_file_open,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= shmem_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= shmem_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};
5224
/*
 * Inode operations table (non-directory, non-special).  Note that set_acl
 * and the fileattr hooks here are guarded by CONFIG_TMPFS_XATTR.
 */
static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
};
5235
/*
 * Inode operations for directories.  Namespace operations (create, lookup,
 * link, rename, ...) need CONFIG_TMPFS; xattr listing and fileattr need
 * CONFIG_TMPFS_XATTR; setattr and set_acl are only set with
 * CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
	.get_offset_ctx	= shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
5261
/*
 * Inode operations for special inodes: getattr always, xattr listing with
 * CONFIG_TMPFS_XATTR, setattr/set_acl with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_special_inode_operations = {
	.getattr	= shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
5272
/*
 * Superblock operations, installed as sb->s_op by shmem_fill_super().
 * statfs/show_options need CONFIG_TMPFS, get_dquots needs CONFIG_TMPFS_QUOTA
 * and the cached-object hooks need CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
	.get_dquots	= shmem_get_dquots,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= inode_just_drop,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};
5292
/*
 * VM operations for shmem mappings: shmem_fault() handles faults and
 * filemap_map_pages() is the map_pages hook.  The mempolicy hooks are only
 * present with CONFIG_NUMA.
 */
static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
5301
/*
 * VM operations installed by shmem_zero_setup() and shmem_zero_setup_desc().
 * The hooks are currently the same as in shmem_vm_ops, but this is a
 * distinct object.
 */
static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
5310
5311int shmem_init_fs_context(struct fs_context *fc)
5312{
5313	struct shmem_options *ctx;
5314
5315	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
5316	if (!ctx)
5317		return -ENOMEM;
5318
5319	ctx->mode = 0777 | S_ISVTX;
5320	ctx->uid = current_fsuid();
5321	ctx->gid = current_fsgid();
5322
5323#if IS_ENABLED(CONFIG_UNICODE)
5324	ctx->encoding = NULL;
5325#endif
5326
5327	fc->fs_private = ctx;
5328	fc->ops = &shmem_fs_context_ops;
5329#ifdef CONFIG_TMPFS
5330	fc->sb_flags |= SB_I_VERSION;
5331#endif
5332	return 0;
5333}
5334
/*
 * The "tmpfs" filesystem type, registered by shmem_init().  Mount parameter
 * descriptions are only provided with CONFIG_TMPFS.
 */
static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_anon_super,
	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};
5345
5346#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5347
/*
 * Initializer for a struct kobj_attribute: the attribute name is the
 * stringified @_name, with the given @_mode and @_show/@_store callbacks.
 */
#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
{									\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
}

/* Define tmpfs_attr_<_name> as a write-only (0200) attribute. */
#define TMPFS_ATTR_W(_name, _store)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0200, NULL, _store)

/* Define tmpfs_attr_<_name> as a read-write (0644) attribute. */
#define TMPFS_ATTR_RW(_name, _show, _store)			\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)

/* Define tmpfs_attr_<_name> as a read-only (0444) attribute. */
#define TMPFS_ATTR_RO(_name, _show)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
5366
5367#if IS_ENABLED(CONFIG_UNICODE)
5368static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
5369			char *buf)
5370{
5371		return sysfs_emit(buf, "supported\n");
5372}
5373TMPFS_ATTR_RO(casefold, casefold_show);
5374#endif
5375
/*
 * NULL-terminated list of tmpfs sysfs attributes; "casefold" is listed only
 * when CONFIG_UNICODE is enabled.
 */
static struct attribute *tmpfs_attributes[] = {
#if IS_ENABLED(CONFIG_UNICODE)
	&tmpfs_attr_casefold.attr,
#endif
	NULL
};
5382
/* Attribute group named "features", created by tmpfs_sysfs_init(). */
static const struct attribute_group tmpfs_attribute_group = {
	.attrs = tmpfs_attributes,
	.name = "features"
};
5387
5388static struct kobject *tmpfs_kobj;
5389
5390static int __init tmpfs_sysfs_init(void)
5391{
5392	int ret;
5393
5394	tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
5395	if (!tmpfs_kobj)
5396		return -ENOMEM;
5397
5398	ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
5399	if (ret)
5400		kobject_put(tmpfs_kobj);
5401
5402	return ret;
5403}
5404#endif /* CONFIG_SYSFS && CONFIG_TMPFS */
5405
/*
 * Boot-time initialisation of shmem/tmpfs: create the inode cache, register
 * the quota format (CONFIG_TMPFS_QUOTA) and the "tmpfs" filesystem type,
 * create the internal kernel mount shm_mnt, set up the sysfs attributes
 * (CONFIG_SYSFS && CONFIG_TMPFS) and apply the huge-page defaults
 * (CONFIG_TRANSPARENT_HUGEPAGE).
 *
 * On failure the out1/out2 labels unwind the registrations and shm_mnt is
 * left holding ERR_PTR(error), so later users can detect it with IS_ERR().
 */
void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
	register_quota_format(&shmem_quota_format);
#endif

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
	error = tmpfs_sysfs_init();
	if (error) {
		pr_err("Could not init tmpfs sysfs\n");
		/*
		 * NOTE(review): shm_mnt is a live mount at this point, but
		 * out1 only unregisters the filesystem and shm_mnt is then
		 * overwritten with ERR_PTR(error); no unmount is visible on
		 * this path - confirm whether one is needed.
		 */
		goto out1;
	}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Propagate the global policy to the internal mount when usable. */
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other multi-size THPs.
	 */
	if (!shmem_orders_configured)
		huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}
5461
5462#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5463static ssize_t shmem_enabled_show(struct kobject *kobj,
5464				  struct kobj_attribute *attr, char *buf)
5465{
5466	static const int values[] = {
5467		SHMEM_HUGE_ALWAYS,
5468		SHMEM_HUGE_WITHIN_SIZE,
5469		SHMEM_HUGE_ADVISE,
5470		SHMEM_HUGE_NEVER,
5471		SHMEM_HUGE_DENY,
5472		SHMEM_HUGE_FORCE,
5473	};
5474	int len = 0;
5475	int i;
5476
5477	for (i = 0; i < ARRAY_SIZE(values); i++) {
5478		len += sysfs_emit_at(buf, len,
5479				shmem_huge == values[i] ? "%s[%s]" : "%s%s",
5480				i ? " " : "", shmem_format_huge(values[i]));
5481	}
5482	len += sysfs_emit_at(buf, len, "\n");
5483
5484	return len;
5485}
5486
5487static ssize_t shmem_enabled_store(struct kobject *kobj,
5488		struct kobj_attribute *attr, const char *buf, size_t count)
5489{
5490	char tmp[16];
5491	int huge, err;
5492
5493	if (count + 1 > sizeof(tmp))
5494		return -EINVAL;
5495	memcpy(tmp, buf, count);
5496	tmp[count] = '\0';
5497	if (count && tmp[count - 1] == '\n')
5498		tmp[count - 1] = '\0';
5499
5500	huge = shmem_parse_huge(tmp);
5501	if (huge == -EINVAL)
5502		return huge;
5503
5504	shmem_huge = huge;
5505	if (shmem_huge > SHMEM_HUGE_DENY)
5506		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5507
5508	err = start_stop_khugepaged();
5509	return err ? err : count;
5510}
5511
/* "shmem_enabled" attribute backed by shmem_enabled_show()/_store(). */
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
/* Taken by thpsize_shmem_enabled_store() around huge_shmem_orders_* updates. */
static DEFINE_SPINLOCK(huge_shmem_orders_lock);
5514
5515static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
5516					  struct kobj_attribute *attr, char *buf)
5517{
5518	int order = to_thpsize(kobj)->order;
5519	const char *output;
5520
5521	if (test_bit(order, &huge_shmem_orders_always))
5522		output = "[always] inherit within_size advise never";
5523	else if (test_bit(order, &huge_shmem_orders_inherit))
5524		output = "always [inherit] within_size advise never";
5525	else if (test_bit(order, &huge_shmem_orders_within_size))
5526		output = "always inherit [within_size] advise never";
5527	else if (test_bit(order, &huge_shmem_orders_madvise))
5528		output = "always inherit within_size [advise] never";
5529	else
5530		output = "always inherit within_size advise [never]";
5531
5532	return sysfs_emit(buf, "%s\n", output);
5533}
5534
/*
 * sysfs store handler for a per-size "shmem_enabled" file.
 *
 * Accepts "always", "inherit", "within_size", "advise" or "never" (compared
 * with sysfs_streq()).  Under huge_shmem_orders_lock, the bit for this
 * kobject's order is cleared in the masks of the other policies and then set
 * in the chosen one; "never" clears it in all four.  "inherit" is refused
 * with -EINVAL for non-PMD orders while shmem_huge is SHMEM_HUGE_FORCE.
 *
 * After a successful update start_stop_khugepaged() is called and its error,
 * if any, replaces the return value.  Otherwise returns @count, or -EINVAL
 * for an unrecognised string.
 */
static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_always);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		/* Do not override huge allocation policy with non-PMD sized mTHP */
		if (shmem_huge == SHMEM_HUGE_FORCE &&
		    order != HPAGE_PMD_ORDER)
			return -EINVAL;

		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_inherit);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "within_size")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		set_bit(order, &huge_shmem_orders_within_size);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "advise")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		clear_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else {
		ret = -EINVAL;
	}

	/* Only poke khugepaged when one of the branches above accepted the input. */
	if (ret > 0) {
		int err = start_stop_khugepaged();

		if (err)
			ret = err;
	}
	return ret;
}
5594
/* Per-size "shmem_enabled" attribute (mode 0644) using the handlers above. */
struct kobj_attribute thpsize_shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
5597#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5598
5599#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
5600
5601static int __init setup_transparent_hugepage_shmem(char *str)
5602{
5603	int huge;
5604
5605	huge = shmem_parse_huge(str);
5606	if (huge == -EINVAL) {
5607		pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
5608		return huge;
5609	}
5610
5611	shmem_huge = huge;
5612	return 1;
5613}
5614__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
5615
5616static int __init setup_transparent_hugepage_tmpfs(char *str)
5617{
5618	int huge;
5619
5620	huge = shmem_parse_huge(str);
5621	if (huge < 0) {
5622		pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
5623		return huge;
5624	}
5625
5626	tmpfs_huge = huge;
5627	return 1;
5628}
5629__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
5630
/* Scratch copy of the thp_shmem= argument; strsep() below modifies it in place. */
static char str_dup[PAGE_SIZE] __initdata;

/*
 * Handler for the "thp_shmem=" boot parameter.
 *
 * As parsed below, the value is a ';'-separated list of "<sizes>:<policy>"
 * entries, where <sizes> is a ','-separated list of single sizes or
 * "<start>-<end>" ranges and <policy> is one of always, advise, inherit,
 * within_size or never.  Each size is converted to an order with
 * get_order_from_str().
 *
 * The new masks are built up in local copies and only written back to the
 * huge_shmem_orders_* globals once the whole string has parsed, so a
 * malformed string leaves the current settings untouched.
 *
 * Returns 1 on success, or 0 (after a warning) if the string is rejected.
 */
static int __init setup_thp_shmem(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise, within_size;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	/* The copy, including its NUL, must fit in str_dup. */
	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strscpy(str_dup, str);

	/* Start from the current settings and modify the local copies. */
	always = huge_shmem_orders_always;
	inherit = huge_shmem_orders_inherit;
	madvise = huge_shmem_orders_madvise;
	within_size = huge_shmem_orders_within_size;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		/* Split "<sizes>:<policy>"; policy is NULL if there is no ':'. */
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				/* "<start>-<end>" range. */
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size,
							   THP_ORDERS_ALL_FILE_DEFAULT);
				end = get_order_from_str(end_size,
							 THP_ORDERS_ALL_FILE_DEFAULT);
			} else {
				/* Single size: a range of one order. */
				start_size = end_size = subtoken;
				start = end = get_order_from_str(subtoken,
								 THP_ORDERS_ALL_FILE_DEFAULT);
			}

			if (start < 0) {
				pr_err("invalid size %s in thp_shmem boot parameter\n",
				       start_size);
				goto err;
			}

			if (end < 0) {
				pr_err("invalid size %s in thp_shmem boot parameter\n",
				       end_size);
				goto err;
			}

			if (start > end)
				goto err;

			/*
			 * Set the order range in the chosen policy's mask and
			 * clear it from the other three ("never" clears all).
			 */
			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "advise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else if (!strcmp(policy, "within_size")) {
				bitmap_set(&within_size, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
				bitmap_clear(&within_size, start, nr);
			} else {
				pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
				goto err;
			}
		}
	}

	/* Everything parsed: commit the new masks. */
	huge_shmem_orders_always = always;
	huge_shmem_orders_madvise = madvise;
	huge_shmem_orders_inherit = inherit;
	huge_shmem_orders_within_size = within_size;
	/* Tells shmem_init() not to apply its default order configuration. */
	shmem_orders_configured = true;
	return 1;

err:
	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_shmem=", setup_thp_shmem);
5731
5732#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5733
5734#else /* !CONFIG_SHMEM */
5735
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
5744
/*
 * !CONFIG_SHMEM variant of the "tmpfs" filesystem type: context setup,
 * parameters and kill_sb all come from ramfs.
 */
static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};
5752
/*
 * !CONFIG_SHMEM initialisation: register the ramfs-backed "tmpfs" type and
 * create the internal mount shm_mnt.  Either step failing triggers BUG_ON().
 */
void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}
5760
/* !CONFIG_SHMEM stub: does nothing and returns 0. */
int shmem_unuse(unsigned int type)
{
	return 0;
}
5765
/* !CONFIG_SHMEM stub: does nothing and returns 0. */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}
5770
/* !CONFIG_SHMEM stub: intentionally empty. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
5774
5775#ifdef CONFIG_MMU
/*
 * !CONFIG_SHMEM version (CONFIG_MMU only): forwards straight to
 * mm_get_unmapped_area() with the same arguments.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
5782#endif
5783
/*
 * !CONFIG_SHMEM version: truncate the inode's page cache over
 * [@lstart, @lend] via truncate_inode_pages_range().
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
5789
/*
 * In the !CONFIG_SHMEM build, alias the shmem ops names to the generic and
 * ramfs equivalents so the common code below can keep using the shmem names.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
5793
/* !CONFIG_SHMEM stub: no accounting is done, always returns 0. */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return 0;
}
5798
/* !CONFIG_SHMEM stub: nothing to undo, intentionally empty. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
}
5802
5803static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
5804				struct super_block *sb, struct inode *dir,
5805				umode_t mode, dev_t dev, unsigned long flags)
5806{
5807	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5808	return inode ? inode : ERR_PTR(-ENOSPC);
5809}
5810
5811#endif /* CONFIG_SHMEM */
5812
5813/* common code */
5814
5815static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
5816				       loff_t size, unsigned long vm_flags,
5817				       unsigned int i_flags)
5818{
5819	unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
5820	struct inode *inode;
5821	struct file *res;
5822
5823	if (IS_ERR(mnt))
5824		return ERR_CAST(mnt);
5825
5826	if (size < 0 || size > MAX_LFS_FILESIZE)
5827		return ERR_PTR(-EINVAL);
5828
5829	if (is_idmapped_mnt(mnt))
5830		return ERR_PTR(-EINVAL);
5831
5832	if (shmem_acct_size(flags, size))
5833		return ERR_PTR(-ENOMEM);
5834
5835	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
5836				S_IFREG | S_IRWXUGO, 0, vm_flags);
5837	if (IS_ERR(inode)) {
5838		shmem_unacct_size(flags, size);
5839		return ERR_CAST(inode);
5840	}
5841	inode->i_flags |= i_flags;
5842	inode->i_size = size;
5843	clear_nlink(inode);	/* It is unlinked */
5844	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
5845	if (!IS_ERR(res))
5846		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
5847				&shmem_file_operations);
5848	if (IS_ERR(res))
5849		iput(inode);
5850	return res;
5851}
5852
/**
 * shmem_kernel_file_setup - get an unlinked, kernel-internal file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on shm_mnt with S_PRIVATE set, so there will be NO LSM
 * permission checks against the underlying inode.  Users of this interface
 * must therefore do LSM checks at a higher layer.  The users are the big_key
 * and shm implementations, where LSM checks are provided at the key or shm
 * level rather than the inode.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
5868
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with no extra inode
 * flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
5880
/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), but on the caller-supplied @mnt instead of the
 * internal shm_mnt.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
5894
5895static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
5896{
5897	loff_t size = end - start;
5898
5899	/*
5900	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
5901	 * between XFS directory reading and selinux: since this file is only
5902	 * accessible to the user through its mapping, use S_PRIVATE flag to
5903	 * bypass file security, in the same way as shmem_kernel_file_setup().
5904	 */
5905	return shmem_kernel_file_setup("dev/zero", size, vm_flags);
5906}
5907
5908/**
5909 * shmem_zero_setup - setup a shared anonymous mapping
5910 * @vma: the vma to be mmapped is prepared by do_mmap
5911 * Returns: 0 on success, or error
5912 */
5913int shmem_zero_setup(struct vm_area_struct *vma)
5914{
5915	struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
5916
5917	if (IS_ERR(file))
5918		return PTR_ERR(file);
5919
5920	if (vma->vm_file)
5921		fput(vma->vm_file);
5922	vma->vm_file = file;
5923	vma->vm_ops = &shmem_anon_vm_ops;
5924
5925	return 0;
5926}
5927
5928/**
5929 * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
5930 * descriptor for convenience.
5931 * @desc: Describes VMA
5932 * Returns: 0 on success, or error
5933 */
5934int shmem_zero_setup_desc(struct vm_area_desc *desc)
5935{
5936	struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
5937
5938	if (IS_ERR(file))
5939		return PTR_ERR(file);
5940
5941	desc->vm_file = file;
5942	desc->vm_ops = &shmem_anon_vm_ops;
5943
5944	return 0;
5945}
5946
/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the folio's address_space
 * @index:	the folio index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 *
 * Return: the folio on success (unlocked before return in the CONFIG_SHMEM
 * build), or an ERR_PTR() on failure.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	/* Look the folio up with SGP_CACHE, passing the current i_size. */
	error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
				    &folio, SGP_CACHE, gfp, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
5985
5986struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
5987					 pgoff_t index, gfp_t gfp)
5988{
5989	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
5990	struct page *page;
5991
5992	if (IS_ERR(folio))
5993		return &folio->page;
5994
5995	page = folio_file_page(folio, index);
5996	if (PageHWPoison(page)) {
5997		folio_put(folio);
5998		return ERR_PTR(-EIO);
5999	}
6000
6001	return page;
6002}
6003EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);