mm/shmem.c at v5.12 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / mm / shmem.c
at v5.12 4275 lines 113 kB view raw
wrap content
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *		 2000 Transmeta Corp.
   6 *		 2000-2001 Christoph Rohland
   7 *		 2000-2001 SAP AG
   8 *		 2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/mm.h>
  32#include <linux/random.h>
  33#include <linux/sched/signal.h>
  34#include <linux/export.h>
  35#include <linux/swap.h>
  36#include <linux/uio.h>
  37#include <linux/khugepaged.h>
  38#include <linux/hugetlb.h>
  39#include <linux/frontswap.h>
  40#include <linux/fs_parser.h>
  41
  42#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
  43
  44static struct vfsmount *shm_mnt;
  45
  46#ifdef CONFIG_SHMEM
  47/*
  48 * This virtual memory filesystem is heavily based on the ramfs. It
  49 * extends ramfs by the ability to use swap and honor resource limits
  50 * which makes it a completely usable filesystem.
  51 */
  52
  53#include <linux/xattr.h>
  54#include <linux/exportfs.h>
  55#include <linux/posix_acl.h>
  56#include <linux/posix_acl_xattr.h>
  57#include <linux/mman.h>
  58#include <linux/string.h>
  59#include <linux/slab.h>
  60#include <linux/backing-dev.h>
  61#include <linux/shmem_fs.h>
  62#include <linux/writeback.h>
  63#include <linux/blkdev.h>
  64#include <linux/pagevec.h>
  65#include <linux/percpu_counter.h>
  66#include <linux/falloc.h>
  67#include <linux/splice.h>
  68#include <linux/security.h>
  69#include <linux/swapops.h>
  70#include <linux/mempolicy.h>
  71#include <linux/namei.h>
  72#include <linux/ctype.h>
  73#include <linux/migrate.h>
  74#include <linux/highmem.h>
  75#include <linux/seq_file.h>
  76#include <linux/magic.h>
  77#include <linux/syscalls.h>
  78#include <linux/fcntl.h>
  79#include <uapi/linux/memfd.h>
  80#include <linux/userfaultfd_k.h>
  81#include <linux/rmap.h>
  82#include <linux/uuid.h>
  83
  84#include <linux/uaccess.h>
  85
  86#include "internal.h"
  87
  88#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
  89#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
  90
  91/* Pretend that each entry is of this size in directory's i_size */
  92#define BOGO_DIRENT_SIZE 20
  93
  94/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
  95#define SHORT_SYMLINK_LEN 128
  96
  97/*
  98 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
  99 * inode->i_private (with i_mutex making sure that it has only one user at
 100 * a time): we would prefer not to enlarge the shmem inode just for that.
 101 */
/*
 * State of one in-progress fallocate, handed to shmem_fault/shmem_writepage
 * through inode->i_private.  All offsets and counters are pgoff_t, i.e. they
 * are expressed in pages rather than bytes.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};
 109
/*
 * Collected mount options for a shmem/tmpfs instance.
 *
 * NOTE(review): the code that fills in and consumes this structure is not
 * visible in this part of the file; the per-field notes below are inferred
 * from names and types only and should be confirmed against the parser.
 */
struct shmem_options {
	unsigned long long blocks;	/* presumably the block limit — confirm */
	unsigned long long inodes;	/* presumably the inode limit — confirm */
	struct mempolicy *mpol;		/* presumably the NUMA policy — confirm */
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;	/* presumably a bitmask of the SHMEM_SEEN_* values below */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
};
 125
 126#ifdef CONFIG_TMPFS
 127static unsigned long shmem_default_max_blocks(void)
 128{
 129	return totalram_pages() / 2;
 130}
 131
 132static unsigned long shmem_default_max_inodes(void)
 133{
 134	unsigned long nr_pages = totalram_pages();
 135
 136	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 137}
 138#endif
 139
/*
 * Prototypes for static helpers that are referenced before their
 * definitions (for example shmem_getpage() below calls shmem_getpage_gfp()).
 * The definitions themselves live further down in this file.
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
			     struct page **pagep, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);
 151
 152int shmem_getpage(struct inode *inode, pgoff_t index,
 153		struct page **pagep, enum sgp_type sgp)
 154{
 155	return shmem_getpage_gfp(inode, index, pagep, sgp,
 156		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
 157}
 158
 159static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 160{
 161	return sb->s_fs_info;
 162}
 163
 164/*
 165 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 166 * for shared memory and for shared anonymous (/dev/zero) mappings
 167 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 168 * consistent with the pre-accounting of private mappings ...
 169 */
 170static inline int shmem_acct_size(unsigned long flags, loff_t size)
 171{
 172	return (flags & VM_NORESERVE) ?
 173		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 174}
 175
 176static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 177{
 178	if (!(flags & VM_NORESERVE))
 179		vm_unacct_memory(VM_ACCT(size));
 180}
 181
 182static inline int shmem_reacct_size(unsigned long flags,
 183		loff_t oldsize, loff_t newsize)
 184{
 185	if (!(flags & VM_NORESERVE)) {
 186		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 187			return security_vm_enough_memory_mm(current->mm,
 188					VM_ACCT(newsize) - VM_ACCT(oldsize));
 189		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 190			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 191	}
 192	return 0;
 193}
 194
 195/*
 196 * ... whereas tmpfs objects are accounted incrementally as
 197 * pages are allocated, in order to allow large sparse files.
 198 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 199 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 200 */
 201static inline int shmem_acct_block(unsigned long flags, long pages)
 202{
 203	if (!(flags & VM_NORESERVE))
 204		return 0;
 205
 206	return security_vm_enough_memory_mm(current->mm,
 207			pages * VM_ACCT(PAGE_SIZE));
 208}
 209
 210static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 211{
 212	if (flags & VM_NORESERVE)
 213		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 214}
 215
 216static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
 217{
 218	struct shmem_inode_info *info = SHMEM_I(inode);
 219	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 220
 221	if (shmem_acct_block(info->flags, pages))
 222		return false;
 223
 224	if (sbinfo->max_blocks) {
 225		if (percpu_counter_compare(&sbinfo->used_blocks,
 226					   sbinfo->max_blocks - pages) > 0)
 227			goto unacct;
 228		percpu_counter_add(&sbinfo->used_blocks, pages);
 229	}
 230
 231	return true;
 232
 233unacct:
 234	shmem_unacct_blocks(info->flags, pages);
 235	return false;
 236}
 237
 238static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
 239{
 240	struct shmem_inode_info *info = SHMEM_I(inode);
 241	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 242
 243	if (sbinfo->max_blocks)
 244		percpu_counter_sub(&sbinfo->used_blocks, pages);
 245	shmem_unacct_blocks(info->flags, pages);
 246}
 247
/*
 * Forward declarations of the operation tables and the filesystem type, so
 * that code ahead of their initialisers can refer to them (vma_is_shmem()
 * just below compares against &shmem_vm_ops, for instance).
 */
static const struct super_operations shmem_ops;
const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;
 256
 257bool vma_is_shmem(struct vm_area_struct *vma)
 258{
 259	return vma->vm_ops == &shmem_vm_ops;
 260}
 261
 262static LIST_HEAD(shmem_swaplist);
 263static DEFINE_MUTEX(shmem_swaplist_mutex);
 264
 265/*
 266 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 267 * produces a novel ino for the newly allocated inode.
 268 *
 269 * It may also be called when making a hard link to permit the space needed by
 270 * each dentry. However, in that case, no new inode number is needed since that
 271 * internally draws from another pool of inode numbers (currently global
 272 * get_next_ino()). This case is indicated by passing NULL as inop.
 273 */
 274#define SHMEM_INO_BATCH 1024
/*
 * Returns 0 on success.  The only failure is -ENOSPC, which can happen
 * solely on the non-SB_KERNMOUNT path when an inode limit is configured
 * (max_inodes != 0) and free_inodes has reached zero.  When @inop is
 * non-NULL, *inop receives the inode number picked for the new inode.
 */
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		/* User-visible mount: limit check and ino allocation under stat_lock */
		spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (!sbinfo->free_inodes) {
				spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_inodes--;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			/* skip a value that is_zero_ino() rejects */
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				/* restart numbering at 1 rather than 0 */
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;
		/* get_cpu()/put_cpu() bracket the use of this CPU's slot */
		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		/* a multiple of the batch size means this CPU's batch is used up */
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			/* claim the next SHMEM_INO_BATCH numbers from the sb */
			spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		/* remember where this CPU continues next time */
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}
 339
 340static void shmem_free_inode(struct super_block *sb)
 341{
 342	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 343	if (sbinfo->max_inodes) {
 344		spin_lock(&sbinfo->stat_lock);
 345		sbinfo->free_inodes++;
 346		spin_unlock(&sbinfo->stat_lock);
 347	}
 348}
 349
 350/**
 351 * shmem_recalc_inode - recalculate the block usage of an inode
 352 * @inode: inode to recalc
 353 *
 354 * We have to calculate the free blocks since the mm can drop
 355 * undirtied hole pages behind our back.
 356 *
 357 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 358 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 359 *
 360 * It has to be called with the spinlock held.
 361 */
 362static void shmem_recalc_inode(struct inode *inode)
 363{
 364	struct shmem_inode_info *info = SHMEM_I(inode);
 365	long freed;
 366
 367	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 368	if (freed > 0) {
 369		info->alloced -= freed;
 370		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 371		shmem_inode_unacct_blocks(inode, freed);
 372	}
 373}
 374
 375bool shmem_charge(struct inode *inode, long pages)
 376{
 377	struct shmem_inode_info *info = SHMEM_I(inode);
 378	unsigned long flags;
 379
 380	if (!shmem_inode_acct_block(inode, pages))
 381		return false;
 382
 383	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
 384	inode->i_mapping->nrpages += pages;
 385
 386	spin_lock_irqsave(&info->lock, flags);
 387	info->alloced += pages;
 388	inode->i_blocks += pages * BLOCKS_PER_PAGE;
 389	shmem_recalc_inode(inode);
 390	spin_unlock_irqrestore(&info->lock, flags);
 391
 392	return true;
 393}
 394
 395void shmem_uncharge(struct inode *inode, long pages)
 396{
 397	struct shmem_inode_info *info = SHMEM_I(inode);
 398	unsigned long flags;
 399
 400	/* nrpages adjustment done by __delete_from_page_cache() or caller */
 401
 402	spin_lock_irqsave(&info->lock, flags);
 403	info->alloced -= pages;
 404	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
 405	shmem_recalc_inode(inode);
 406	spin_unlock_irqrestore(&info->lock, flags);
 407
 408	shmem_inode_unacct_blocks(inode, pages);
 409}
 410
 411/*
 412 * Replace item expected in xarray by a new item, while holding xa_lock.
 413 */
 414static int shmem_replace_entry(struct address_space *mapping,
 415			pgoff_t index, void *expected, void *replacement)
 416{
 417	XA_STATE(xas, &mapping->i_pages, index);
 418	void *item;
 419
 420	VM_BUG_ON(!expected);
 421	VM_BUG_ON(!replacement);
 422	item = xas_load(&xas);
 423	if (item != expected)
 424		return -ENOENT;
 425	xas_store(&xas, replacement);
 426	return 0;
 427}
 428
 429/*
 430 * Sometimes, before we decide whether to proceed or to fail, we must check
 431 * that an entry was not already brought back from swap by a racing thread.
 432 *
 433 * Checking page is not enough: by the time a SwapCache page is locked, it
 434 * might be reused, and again be SwapCache, using the same swap as before.
 435 */
 436static bool shmem_confirm_swap(struct address_space *mapping,
 437			       pgoff_t index, swp_entry_t swap)
 438{
 439	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
 440}
 441
 442/*
 443 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 444 *
 445 * SHMEM_HUGE_NEVER:
 446 *	disables huge pages for the mount;
 447 * SHMEM_HUGE_ALWAYS:
 448 *	enables huge pages for the mount;
 449 * SHMEM_HUGE_WITHIN_SIZE:
 450 *	only allocate huge pages if the page will be fully within i_size,
 451 *	also respect fadvise()/madvise() hints;
 452 * SHMEM_HUGE_ADVISE:
 453 *	only allocate huge pages if requested with fadvise()/madvise();
 454 */
 455
 456#define SHMEM_HUGE_NEVER	0
 457#define SHMEM_HUGE_ALWAYS	1
 458#define SHMEM_HUGE_WITHIN_SIZE	2
 459#define SHMEM_HUGE_ADVISE	3
 460
 461/*
 462 * Special values.
 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 464 *
 465 * SHMEM_HUGE_DENY:
 466 *	disables huge on shm_mnt and all mounts, for emergency use;
 467 * SHMEM_HUGE_FORCE:
 468 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 469 *
 470 */
 471#define SHMEM_HUGE_DENY		(-1)
 472#define SHMEM_HUGE_FORCE	(-2)
 473
 474#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 475/* ifdef here to avoid bloating shmem.o when not necessary */
 476
 477static int shmem_huge __read_mostly;
 478
 479#if defined(CONFIG_SYSFS)
 480static int shmem_parse_huge(const char *str)
 481{
 482	if (!strcmp(str, "never"))
 483		return SHMEM_HUGE_NEVER;
 484	if (!strcmp(str, "always"))
 485		return SHMEM_HUGE_ALWAYS;
 486	if (!strcmp(str, "within_size"))
 487		return SHMEM_HUGE_WITHIN_SIZE;
 488	if (!strcmp(str, "advise"))
 489		return SHMEM_HUGE_ADVISE;
 490	if (!strcmp(str, "deny"))
 491		return SHMEM_HUGE_DENY;
 492	if (!strcmp(str, "force"))
 493		return SHMEM_HUGE_FORCE;
 494	return -EINVAL;
 495}
 496#endif
 497
 498#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 499static const char *shmem_format_huge(int huge)
 500{
 501	switch (huge) {
 502	case SHMEM_HUGE_NEVER:
 503		return "never";
 504	case SHMEM_HUGE_ALWAYS:
 505		return "always";
 506	case SHMEM_HUGE_WITHIN_SIZE:
 507		return "within_size";
 508	case SHMEM_HUGE_ADVISE:
 509		return "advise";
 510	case SHMEM_HUGE_DENY:
 511		return "deny";
 512	case SHMEM_HUGE_FORCE:
 513		return "force";
 514	default:
 515		VM_BUG_ON(1);
 516		return "bad_val";
 517	}
 518}
 519#endif
 520
 521static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 522		struct shrink_control *sc, unsigned long nr_to_split)
 523{
 524	LIST_HEAD(list), *pos, *next;
 525	LIST_HEAD(to_remove);
 526	struct inode *inode;
 527	struct shmem_inode_info *info;
 528	struct page *page;
 529	unsigned long batch = sc ? sc->nr_to_scan : 128;
 530	int removed = 0, split = 0;
 531
 532	if (list_empty(&sbinfo->shrinklist))
 533		return SHRINK_STOP;
 534
 535	spin_lock(&sbinfo->shrinklist_lock);
 536	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
 537		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 538
 539		/* pin the inode */
 540		inode = igrab(&info->vfs_inode);
 541
 542		/* inode is about to be evicted */
 543		if (!inode) {
 544			list_del_init(&info->shrinklist);
 545			removed++;
 546			goto next;
 547		}
 548
 549		/* Check if there's anything to gain */
 550		if (round_up(inode->i_size, PAGE_SIZE) ==
 551				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
 552			list_move(&info->shrinklist, &to_remove);
 553			removed++;
 554			goto next;
 555		}
 556
 557		list_move(&info->shrinklist, &list);
 558next:
 559		if (!--batch)
 560			break;
 561	}
 562	spin_unlock(&sbinfo->shrinklist_lock);
 563
 564	list_for_each_safe(pos, next, &to_remove) {
 565		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 566		inode = &info->vfs_inode;
 567		list_del_init(&info->shrinklist);
 568		iput(inode);
 569	}
 570
 571	list_for_each_safe(pos, next, &list) {
 572		int ret;
 573
 574		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 575		inode = &info->vfs_inode;
 576
 577		if (nr_to_split && split >= nr_to_split)
 578			goto leave;
 579
 580		page = find_get_page(inode->i_mapping,
 581				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 582		if (!page)
 583			goto drop;
 584
 585		/* No huge page at the end of the file: nothing to split */
 586		if (!PageTransHuge(page)) {
 587			put_page(page);
 588			goto drop;
 589		}
 590
 591		/*
 592		 * Leave the inode on the list if we failed to lock
 593		 * the page at this time.
 594		 *
 595		 * Waiting for the lock may lead to deadlock in the
 596		 * reclaim path.
 597		 */
 598		if (!trylock_page(page)) {
 599			put_page(page);
 600			goto leave;
 601		}
 602
 603		ret = split_huge_page(page);
 604		unlock_page(page);
 605		put_page(page);
 606
 607		/* If split failed leave the inode on the list */
 608		if (ret)
 609			goto leave;
 610
 611		split++;
 612drop:
 613		list_del_init(&info->shrinklist);
 614		removed++;
 615leave:
 616		iput(inode);
 617	}
 618
 619	spin_lock(&sbinfo->shrinklist_lock);
 620	list_splice_tail(&list, &sbinfo->shrinklist);
 621	sbinfo->shrinklist_len -= removed;
 622	spin_unlock(&sbinfo->shrinklist_lock);
 623
 624	return split;
 625}
 626
 627static long shmem_unused_huge_scan(struct super_block *sb,
 628		struct shrink_control *sc)
 629{
 630	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 631
 632	if (!READ_ONCE(sbinfo->shrinklist_len))
 633		return SHRINK_STOP;
 634
 635	return shmem_unused_huge_shrink(sbinfo, sc, 0);
 636}
 637
 638static long shmem_unused_huge_count(struct super_block *sb,
 639		struct shrink_control *sc)
 640{
 641	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 642	return READ_ONCE(sbinfo->shrinklist_len);
 643}
 644#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 645
 646#define shmem_huge SHMEM_HUGE_DENY
 647
 648static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 649		struct shrink_control *sc, unsigned long nr_to_split)
 650{
 651	return 0;
 652}
 653#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 654
 655static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
 656{
 657	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
 658	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
 659	    shmem_huge != SHMEM_HUGE_DENY)
 660		return true;
 661	return false;
 662}
 663
 664/*
 665 * Like add_to_page_cache_locked, but error if expected item has gone.
 666 */
/*
 * Insert @page (possibly compound) into @mapping at @index, but only if the
 * slot(s) currently hold @expected (NULL for "must be empty").
 *
 * The page must be locked, swap-backed and not a tail page, and @index must
 * be aligned to the page's size in base pages.  Unless the page is already
 * in the swap cache it is charged to @charge_mm's memory cgroup first.
 *
 * Returns 0 on success.  On failure returns the mem_cgroup_charge() error or
 * the xarray error (-EEXIST when the slot did not hold @expected), with
 * page->mapping reset to NULL and the extra page references dropped again.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp,
				   struct mm_struct *charge_mm)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
	unsigned long i = 0;
	unsigned long nr = compound_nr(page);
	int error;

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	/* one extra reference per base page that will sit in the xarray */
	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	if (!PageSwapCache(page)) {
		error = mem_cgroup_charge(page, charge_mm, gfp);
		if (error) {
			if (PageTransHuge(page)) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			goto error;
		}
	}
	cgroup_throttle_swaprate(page, gfp);

	/* retried for as long as xas_nomem() reports it could allocate memory */
	do {
		void *entry;
		xas_lock_irq(&xas);
		entry = xas_find_conflict(&xas);
		/* slot contents differ from what the caller expected */
		if (entry != expected)
			xas_set_err(&xas, -EEXIST);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
next:
		/* store the page into each of the nr consecutive slots */
		xas_store(&xas, page);
		if (++i < nr) {
			xas_next(&xas);
			goto next;
		}
		if (PageTransHuge(page)) {
			count_vm_event(THP_FILE_ALLOC);
			__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
		}
		/* page count and statistics are updated under the xarray lock */
		mapping->nrpages += nr;
		__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
		__mod_lruvec_page_state(page, NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		goto error;
	}

	return 0;
error:
	/* undo the setup done before the insertion attempt */
	page->mapping = NULL;
	page_ref_sub(page, nr);
	return error;
}
 736
 737/*
 738 * Like delete_from_page_cache, but substitutes swap for page.
 739 */
/*
 * Take @page out of its mapping, leaving @radswap in the slot instead.
 * Only non-compound pages are accepted.  The page's mapping pointer, the
 * mapping's nrpages and the NR_FILE_PAGES/NR_SHMEM statistics are all
 * updated under the i_pages lock, then one page reference is dropped.
 * It is a BUG if the slot did not hold @page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_lruvec_page_state(page, NR_FILE_PAGES);
	__dec_lruvec_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	/* checked only after the lock and the reference have been released */
	BUG_ON(error);
}
 757
 758/*
 759 * Remove swap entry from page cache, free the swap and its page cache.
 760 */
 761static int shmem_free_swap(struct address_space *mapping,
 762			   pgoff_t index, void *radswap)
 763{
 764	void *old;
 765
 766	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
 767	if (old != radswap)
 768		return -ENOENT;
 769	free_swap_and_cache(radix_to_swp_entry(radswap));
 770	return 0;
 771}
 772
 773/*
 774 * Determine (in bytes) how many of the shmem object's pages mapped by the
 775 * given offsets are swapped out.
 776 *
 777 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 778 * as long as the inode doesn't go away and racy results are not a problem.
 779 */
 780unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 781						pgoff_t start, pgoff_t end)
 782{
 783	XA_STATE(xas, &mapping->i_pages, start);
 784	struct page *page;
 785	unsigned long swapped = 0;
 786
 787	rcu_read_lock();
 788	xas_for_each(&xas, page, end - 1) {
 789		if (xas_retry(&xas, page))
 790			continue;
 791		if (xa_is_value(page))
 792			swapped++;
 793
 794		if (need_resched()) {
 795			xas_pause(&xas);
 796			cond_resched_rcu();
 797		}
 798	}
 799
 800	rcu_read_unlock();
 801
 802	return swapped << PAGE_SHIFT;
 803}
 804
 805/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma are swapped out.
 808 *
 809 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 810 * as long as the inode doesn't go away and racy results are not a problem.
 811 */
 812unsigned long shmem_swap_usage(struct vm_area_struct *vma)
 813{
 814	struct inode *inode = file_inode(vma->vm_file);
 815	struct shmem_inode_info *info = SHMEM_I(inode);
 816	struct address_space *mapping = inode->i_mapping;
 817	unsigned long swapped;
 818
 819	/* Be careful as we don't hold info->lock */
 820	swapped = READ_ONCE(info->swapped);
 821
 822	/*
 823	 * The easier cases are when the shmem object has nothing in swap, or
 824	 * the vma maps it whole. Then we can simply use the stats that we
 825	 * already track.
 826	 */
 827	if (!swapped)
 828		return 0;
 829
 830	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
 831		return swapped << PAGE_SHIFT;
 832
 833	/* Here comes the more involved part */
 834	return shmem_partial_swap_usage(mapping,
 835			linear_page_index(vma, vma->vm_start),
 836			linear_page_index(vma, vma->vm_end));
 837}
 838
 839/*
 840 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 841 */
 842void shmem_unlock_mapping(struct address_space *mapping)
 843{
 844	struct pagevec pvec;
 845	pgoff_t index = 0;
 846
 847	pagevec_init(&pvec);
 848	/*
 849	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
 850	 */
 851	while (!mapping_unevictable(mapping)) {
 852		if (!pagevec_lookup(&pvec, mapping, &index))
 853			break;
 854		check_move_unevictable_pages(&pvec);
 855		pagevec_release(&pvec);
 856		cond_resched();
 857	}
 858}
 859
 860/*
 861 * Check whether a hole-punch or truncation needs to split a huge page,
 862 * returning true if no split was required, or the split has been successful.
 863 *
 864 * Eviction (or truncation to 0 size) should never need to split a huge page;
 865 * but in rare cases might do so, if shmem_undo_range() failed to trylock on
 866 * head, and then succeeded to trylock on tail.
 867 *
 868 * A split can only succeed when there are no additional references on the
 869 * huge page: so the split below relies upon find_get_entries() having stopped
 870 * when it found a subpage of the huge page, without getting further references.
 871 */
 872static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
 873{
 874	if (!PageTransCompound(page))
 875		return true;
 876
 877	/* Just proceed to delete a huge page wholly within the range punched */
 878	if (PageHead(page) &&
 879	    page->index >= start && page->index + HPAGE_PMD_NR <= end)
 880		return true;
 881
 882	/* Try to split huge page, so we can truly punch the hole or truncate */
 883	return split_huge_page(page) >= 0;
 884}
 885
 886/*
 887 * Remove range of pages and swap entries from page cache, and free them.
 888 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 889 */
/*
 * @inode:    inode whose page cache and swap entries are to be removed
 * @lstart:   first byte of the range
 * @lend:     last byte of the range (inclusive); -1 means "to end of file"
 * @unfalloc: false to truncate/punch a hole; true to undo a failed
 *            fallocate, in which case swap entries and uptodate pages are
 *            left in place
 *
 * Works in three steps: a first pass over whole pages using
 * find_lock_entries(), zeroing of the partial pages at either edge, and a
 * second pass using find_get_entries() with a blocking lock_page() that
 * retries entries which changed underneath it.  Finally info->swapped is
 * reduced by the number of swap entries freed and the inode's block usage
 * is recalculated.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	/* first page index lying wholly at or after lstart */
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* exclusive end: index just past the last page wholly within the range */
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	/* byte offsets inside the partially covered first and last pages */
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	/*
	 * Pass 1: handle the entries returned by find_lock_entries().
	 * NOTE(review): presumably only entries it could lock without
	 * waiting — confirm against its definition.
	 */
	pagevec_init(&pvec);
	index = start;
	while (index < end && find_lock_entries(mapping, index, end - 1,
			&pvec, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];

			/* value entry: a swap entry rather than a page */
			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				/* shmem_free_swap() returns 0 when it freed the entry */
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}
			/* step over the remaining subpages of a compound page */
			index += thp_nr_pages(page) - 1;

			/* when undoing fallocate, keep pages that are uptodate */
			if (!unfalloc || !PageUptodate(page))
				truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	/* Zero the tail of the page that contains lstart, if it is present */
	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			/*
			 * The range starts and ends inside this one page:
			 * zero only up to partial_end and skip the separate
			 * partial_end step below.
			 */
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	/* Zero the head of the page that contains lend + 1, if it is present */
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	/* no whole page in the range: nothing left for the second pass */
	if (start >= end)
		return;

	/* Pass 2: revisit the range, this time waiting for page locks */
	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, index, end - 1, &pvec,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					/* index-- offsets the index++ after this loop */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (!unfalloc || !PageUptodate(page)) {
				if (page_mapping(page) != mapping) {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
				VM_BUG_ON_PAGE(PageWriteback(page), page);
				if (shmem_punch_compound(page, start, end))
					truncate_inode_page(mapping, page);
				else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
					/* Wipe the page and don't get stuck */
					clear_highpage(page);
					flush_dcache_page(page);
					set_page_dirty(page);
					/*
					 * NOTE(review): presumably advancing
					 * start keeps the truncation restart
					 * above from retrying this unsplit
					 * page forever — confirm.
					 */
					if (index <
					    round_up(start, HPAGE_PMD_NR))
						start = index + 1;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	/* fold the freed swap entries into the inode's accounting */
	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}
1027
1028void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1029{
1030	shmem_undo_range(inode, lstart, lend, false);
1031	inode->i_ctime = inode->i_mtime = current_time(inode);
1032}
1033EXPORT_SYMBOL_GPL(shmem_truncate_range);
1034
1035static int shmem_getattr(struct user_namespace *mnt_userns,
1036			 const struct path *path, struct kstat *stat,
1037			 u32 request_mask, unsigned int query_flags)
1038{
1039	struct inode *inode = path->dentry->d_inode;
1040	struct shmem_inode_info *info = SHMEM_I(inode);
1041	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
1042
1043	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
1044		spin_lock_irq(&info->lock);
1045		shmem_recalc_inode(inode);
1046		spin_unlock_irq(&info->lock);
1047	}
1048	generic_fillattr(&init_user_ns, inode, stat);
1049
1050	if (is_huge_enabled(sb_info))
1051		stat->blksize = HPAGE_PMD_SIZE;
1052
1053	return 0;
1054}
1055
/*
 * Apply the attribute changes described by @attr to the inode of @dentry.
 *
 * For a regular file with ATTR_SIZE set, a resize is first checked against
 * the F_SEAL_SHRINK / F_SEAL_GROW seals and re-accounted through
 * shmem_reacct_size().  Whenever the new size does not exceed the old one
 * (equality included), the range beyond the new size is unmapped and
 * truncated.  All remaining attributes are copied by setattr_copy(), and a
 * mode change is passed on to posix_acl_chmod().
 *
 * @mnt_userns is not referenced here; &init_user_ns is handed to the
 * helpers instead.
 *
 * Returns 0 on success, -EPERM for a resize forbidden by a seal, or the
 * error returned by setattr_prepare(), shmem_reacct_size() or
 * posix_acl_chmod().
 */
static int shmem_setattr(struct user_namespace *mnt_userns,
			 struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int error;

	error = setattr_prepare(&init_user_ns, dentry, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			/* Adjust size accounting before publishing the new size */
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = current_time(inode);
		}
		if (newsize <= oldsize) {
			/* First page boundary at or above the new size */
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			/* Nothing to truncate if the inode has no pages accounted */
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);

			/*
			 * Part of the huge page can be beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
				spin_lock(&sbinfo->shrinklist_lock);
				/*
				 * _careful to defend against unlocked access to
				 * ->shrink_list in shmem_unused_huge_shrink()
				 */
				if (list_empty_careful(&info->shrinklist)) {
					list_add_tail(&info->shrinklist,
							&sbinfo->shrinklist);
					sbinfo->shrinklist_len++;
				}
				spin_unlock(&sbinfo->shrinklist_lock);
			}
		}
	}

	setattr_copy(&init_user_ns, inode, attr);
	/* error is still 0 here unless posix_acl_chmod() overrides it */
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
	return error;
}
1123
/*
 * Tear down @inode when it is being evicted.
 *
 * For an inode whose mapping is a shmem mapping: give back the size
 * accounting, zero i_size, truncate the whole file, and unlink the inode
 * from the superblock's shrinklist and from the global swaplist.  For every
 * inode: free its xattrs, release the per-superblock inode accounting via
 * shmem_free_inode(), and finish with clear_inode().
 */
static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_mapping(inode->i_mapping)) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		/* Unlocked check first, repeated under shrinklist_lock */
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		/* Loop until this inode is really off the swaplist */
		while (!list_empty(&info->swaplist)) {
			/* Wait while shmem_unuse() is scanning this inode... */
			wait_var_event(&info->stop_eviction,
				       !atomic_read(&info->stop_eviction));
			mutex_lock(&shmem_swaplist_mutex);
			/* ...but beware of the race if we peeked too early */
			if (!atomic_read(&info->stop_eviction))
				list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	/* All blocks are expected to be gone by this point */
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}
1158
1159extern struct swap_info_struct *swap_info[];
1160
1161static int shmem_find_swap_entries(struct address_space *mapping,
1162				   pgoff_t start, unsigned int nr_entries,
1163				   struct page **entries, pgoff_t *indices,
1164				   unsigned int type, bool frontswap)
1165{
1166	XA_STATE(xas, &mapping->i_pages, start);
1167	struct page *page;
1168	swp_entry_t entry;
1169	unsigned int ret = 0;
1170
1171	if (!nr_entries)
1172		return 0;
1173
1174	rcu_read_lock();
1175	xas_for_each(&xas, page, ULONG_MAX) {
1176		if (xas_retry(&xas, page))
1177			continue;
1178
1179		if (!xa_is_value(page))
1180			continue;
1181
1182		entry = radix_to_swp_entry(page);
1183		if (swp_type(entry) != type)
1184			continue;
1185		if (frontswap &&
1186		    !frontswap_test(swap_info[type], swp_offset(entry)))
1187			continue;
1188
1189		indices[ret] = xas.xa_index;
1190		entries[ret] = page;
1191
1192		if (need_resched()) {
1193			xas_pause(&xas);
1194			cond_resched_rcu();
1195		}
1196		if (++ret == nr_entries)
1197			break;
1198	}
1199	rcu_read_unlock();
1200
1201	return ret;
1202}
1203
1204/*
1205 * Move the swapped pages for an inode to page cache. Returns the count
1206 * of pages swapped in, or the error in case of failure.
1207 */
1208static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1209				    pgoff_t *indices)
1210{
1211	int i = 0;
1212	int ret = 0;
1213	int error = 0;
1214	struct address_space *mapping = inode->i_mapping;
1215
1216	for (i = 0; i < pvec.nr; i++) {
1217		struct page *page = pvec.pages[i];
1218
1219		if (!xa_is_value(page))
1220			continue;
1221		error = shmem_swapin_page(inode, indices[i],
1222					  &page, SGP_CACHE,
1223					  mapping_gfp_mask(mapping),
1224					  NULL, NULL);
1225		if (error == 0) {
1226			unlock_page(page);
1227			put_page(page);
1228			ret++;
1229		}
1230		if (error == -ENOMEM)
1231			break;
1232		error = 0;
1233	}
1234	return error ? error : ret;
1235}
1236
1237/*
1238 * If swap found in inode, free it and move page from swapcache to filecache.
1239 */
1240static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1241			     bool frontswap, unsigned long *fs_pages_to_unuse)
1242{
1243	struct address_space *mapping = inode->i_mapping;
1244	pgoff_t start = 0;
1245	struct pagevec pvec;
1246	pgoff_t indices[PAGEVEC_SIZE];
1247	bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1248	int ret = 0;
1249
1250	pagevec_init(&pvec);
1251	do {
1252		unsigned int nr_entries = PAGEVEC_SIZE;
1253
1254		if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1255			nr_entries = *fs_pages_to_unuse;
1256
1257		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1258						  pvec.pages, indices,
1259						  type, frontswap);
1260		if (pvec.nr == 0) {
1261			ret = 0;
1262			break;
1263		}
1264
1265		ret = shmem_unuse_swap_entries(inode, pvec, indices);
1266		if (ret < 0)
1267			break;
1268
1269		if (frontswap_partial) {
1270			*fs_pages_to_unuse -= ret;
1271			if (*fs_pages_to_unuse == 0) {
1272				ret = FRONTSWAP_PAGES_UNUSED;
1273				break;
1274			}
1275		}
1276
1277		start = indices[pvec.nr - 1];
1278	} while (true);
1279
1280	return ret;
1281}
1282
/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 *
 * Walks shmem_swaplist, calling shmem_unuse_inode() on every inode that
 * still has swapped pages and pruning inodes whose swapped count is zero.
 * @frontswap and @fs_pages_to_unuse are passed through unchanged.
 *
 * Returns 0 if the list is empty or fully processed, otherwise the first
 * non-zero value returned by shmem_unuse_inode(), at which point the walk
 * stops.
 */
int shmem_unuse(unsigned int type, bool frontswap,
		unsigned long *fs_pages_to_unuse)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	/* Unlocked fast path: nothing has ever been swapped out */
	if (list_empty(&shmem_swaplist))
		return 0;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
		/*
		 * Drop the swaplist mutex while searching the inode for swap;
		 * but before doing so, make sure shmem_evict_inode() will not
		 * remove placeholder inode from swaplist, nor let it be freed
		 * (igrab() would protect from unlink, but not from unmount).
		 */
		atomic_inc(&info->stop_eviction);
		mutex_unlock(&shmem_swaplist_mutex);

		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
					  fs_pages_to_unuse);
		cond_resched();

		mutex_lock(&shmem_swaplist_mutex);
		/* The list may have changed while unlocked: refetch the successor */
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
		/* Last holder wakes anyone waiting on stop_eviction */
		if (atomic_dec_and_test(&info->stop_eviction))
			wake_up_var(&info->stop_eviction);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

	return error;
}
1329
/*
 * Move the page from the page cache to the swap cache.
 *
 * The page must be locked and must not be a compound page.  The page is
 * redirtied instead of being swapped when the inode is VM_LOCKED, when
 * there is no swap, when the call is not for reclaim, when the page falls
 * inside a fallocate still in progress, or when no swap slot or swap cache
 * insertion can be had.
 *
 * Returns 0 after handing the page to swap_writepage(), or after redirtying
 * and unlocking it; returns AOP_WRITEPAGE_ACTIVATE, with the page still
 * locked, when the page is redirtied under reclaim.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	VM_BUG_ON_PAGE(PageCompound(page), page);
	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			/* Re-read i_private under i_lock before trusting it */
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		/* Not uptodate and not being fallocated: zero it before writing */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page(page);
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap,
			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
			NULL) == 0) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		info->swapped++;
		spin_unlock_irq(&info->lock);

		swap_shmem_alloc(swap);
		/* Replace the page in the file's mapping by its swap entry */
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	/* Swap cache insertion failed: give the swap slot back */
	mutex_unlock(&shmem_swaplist_mutex);
	put_swap_page(page, swap);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}
1438
1439#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1440static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1441{
1442	char buffer[64];
1443
1444	if (!mpol || mpol->mode == MPOL_DEFAULT)
1445		return;		/* show nothing */
1446
1447	mpol_to_str(buffer, sizeof(buffer), mpol);
1448
1449	seq_printf(seq, ",mpol=%s", buffer);
1450}
1451
1452static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1453{
1454	struct mempolicy *mpol = NULL;
1455	if (sbinfo->mpol) {
1456		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1457		mpol = sbinfo->mpol;
1458		mpol_get(mpol);
1459		spin_unlock(&sbinfo->stat_lock);
1460	}
1461	return mpol;
1462}
1463#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Empty stand-in used when CONFIG_NUMA or CONFIG_TMPFS is not enabled. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stand-in used when CONFIG_NUMA or CONFIG_TMPFS is not enabled: no policy. */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
1471#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1472#ifndef CONFIG_NUMA
1473#define vm_policy vm_private_data
1474#endif
1475
1476static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1477		struct shmem_inode_info *info, pgoff_t index)
1478{
1479	/* Create a pseudo vma that just contains the policy */
1480	vma_init(vma, NULL);
1481	/* Bias interleave by inode number to distribute better across nodes */
1482	vma->vm_pgoff = index + info->vfs_inode.i_ino;
1483	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1484}
1485
/*
 * Undo shmem_pseudo_vma_init(): the only thing to release is the policy
 * held in @vma.
 */
static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(vma->vm_policy);
}
1491
1492static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1493			struct shmem_inode_info *info, pgoff_t index)
1494{
1495	struct vm_area_struct pvma;
1496	struct page *page;
1497	struct vm_fault vmf = {
1498		.vma = &pvma,
1499	};
1500
1501	shmem_pseudo_vma_init(&pvma, info, index);
1502	page = swap_cluster_readahead(swap, gfp, &vmf);
1503	shmem_pseudo_vma_destroy(&pvma);
1504
1505	return page;
1506}
1507
1508/*
1509 * Make sure huge_gfp is always more limited than limit_gfp.
1510 * Some of the flags set permissions, while others set limitations.
1511 */
1512static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1513{
1514	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1515	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1516	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1517	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1518
1519	/* Allow allocations only from the originally specified zones. */
1520	result |= zoneflags;
1521
1522	/*
1523	 * Minimize the result gfp by taking the union with the deny flags,
1524	 * and the intersection of the allow flags.
1525	 */
1526	result |= (limit_gfp & denyflags);
1527	result |= (huge_gfp & limit_gfp) & allowflags;
1528
1529	return result;
1530}
1531
1532static struct page *shmem_alloc_hugepage(gfp_t gfp,
1533		struct shmem_inode_info *info, pgoff_t index)
1534{
1535	struct vm_area_struct pvma;
1536	struct address_space *mapping = info->vfs_inode.i_mapping;
1537	pgoff_t hindex;
1538	struct page *page;
1539
1540	hindex = round_down(index, HPAGE_PMD_NR);
1541	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1542								XA_PRESENT))
1543		return NULL;
1544
1545	shmem_pseudo_vma_init(&pvma, info, hindex);
1546	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
1547			       true);
1548	shmem_pseudo_vma_destroy(&pvma);
1549	if (page)
1550		prep_transhuge_page(page);
1551	else
1552		count_vm_event(THP_FILE_FALLBACK);
1553	return page;
1554}
1555
1556static struct page *shmem_alloc_page(gfp_t gfp,
1557			struct shmem_inode_info *info, pgoff_t index)
1558{
1559	struct vm_area_struct pvma;
1560	struct page *page;
1561
1562	shmem_pseudo_vma_init(&pvma, info, index);
1563	page = alloc_page_vma(gfp, &pvma, 0);
1564	shmem_pseudo_vma_destroy(&pvma);
1565
1566	return page;
1567}
1568
1569static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1570		struct inode *inode,
1571		pgoff_t index, bool huge)
1572{
1573	struct shmem_inode_info *info = SHMEM_I(inode);
1574	struct page *page;
1575	int nr;
1576	int err = -ENOSPC;
1577
1578	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1579		huge = false;
1580	nr = huge ? HPAGE_PMD_NR : 1;
1581
1582	if (!shmem_inode_acct_block(inode, nr))
1583		goto failed;
1584
1585	if (huge)
1586		page = shmem_alloc_hugepage(gfp, info, index);
1587	else
1588		page = shmem_alloc_page(gfp, info, index);
1589	if (page) {
1590		__SetPageLocked(page);
1591		__SetPageSwapBacked(page);
1592		return page;
1593	}
1594
1595	err = -ENOMEM;
1596	shmem_inode_unacct_blocks(inode, nr);
1597failed:
1598	return ERR_PTR(err);
1599}
1600
1601/*
1602 * When a page is moved from swapcache to shmem filecache (either by the
1603 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
1604 * shmem_unuse_inode()), it may have been read in earlier from swap, in
1605 * ignorance of the mapping it belongs to.  If that mapping has special
1606 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1607 * we may need to copy to a suitable page before moving to filecache.
1608 *
1609 * In a future release, this may well be extended to respect cpuset and
1610 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1611 * but for now it is a simple matter of zone.
1612 */
1613static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
1614{
1615	return page_zonenum(page) > gfp_zone(gfp);
1616}
1617
/*
 * Replace the swap cache page *pagep by a newly allocated copy.
 *
 * A page is allocated with @gfp (minus GFP_CONSTRAINT_MASK), the old
 * contents are copied into it, and it is substituted for the old page in
 * the old page's mapping at the offset taken from the swap entry held in
 * page_private().
 *
 * On success *pagep points to the new page, which is left locked, and the
 * old page is unlocked and released.  If shmem_replace_entry() fails,
 * *pagep is left untouched and the new page is the one that is unlocked
 * and released.
 *
 * Returns 0 on success, -ENOMEM if no page could be allocated, or the
 * error from shmem_replace_entry().
 */
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct page *oldpage, *newpage;
	struct address_space *swap_mapping;
	swp_entry_t entry;
	pgoff_t swap_index;
	int error;

	oldpage = *pagep;
	entry.val = page_private(oldpage);
	swap_index = swp_offset(entry);
	swap_mapping = page_mapping(oldpage);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	newpage = shmem_alloc_page(gfp, info, index);
	if (!newpage)
		return -ENOMEM;

	/* Extra reference, so newpage holds two: matches the two puts below */
	get_page(newpage);
	copy_highpage(newpage, oldpage);
	flush_dcache_page(newpage);

	/* Make newpage look like the swap cache page it is about to replace */
	__SetPageLocked(newpage);
	__SetPageSwapBacked(newpage);
	SetPageUptodate(newpage);
	set_page_private(newpage, entry.val);
	SetPageSwapCache(newpage);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	xa_lock_irq(&swap_mapping->i_pages);
	error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
	if (!error) {
		mem_cgroup_migrate(oldpage, newpage);
		__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
		__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
	}
	xa_unlock_irq(&swap_mapping->i_pages);

	if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers check
		 * both PageSwapCache and page_private after getting page lock;
		 * but be defensive.  Reverse old to newpage for clear and free.
		 */
		oldpage = newpage;
	} else {
		lru_cache_add(newpage);
		*pagep = newpage;
	}

	/* From here on "oldpage" is whichever page is being discarded */
	ClearPageSwapCache(oldpage);
	set_page_private(oldpage, 0);

	unlock_page(oldpage);
	/*
	 * Two references are dropped.  NOTE(review): for the original page
	 * these are presumably the swap cache's and the caller's - confirm.
	 */
	put_page(oldpage);
	put_page(oldpage);
	return error;
}
1684
/*
 * Swap in the page pointed to by *pagep.
 * Caller has to make sure that *pagep contains a valid swapped page.
 * Returns 0 and the page in pagep if success. On failure, returns the
 * error code and NULL in *pagep.
 *
 * On success the page is returned still locked and marked dirty, the
 * inode's swapped count has been decremented, and the swap entry freed.
 *
 * Errors produced here: -ENOMEM when shmem_swapin() yields no page, -EIO
 * when the page is not uptodate, and -EEXIST whenever the swap entry is no
 * longer what the mapping holds at @index.  Errors from
 * shmem_replace_page() and shmem_add_to_page_cache() are passed on, unless
 * overridden by that -EEXIST check.
 *
 * When @fault_type is non-NULL and the page was not found in the swap
 * cache, VM_FAULT_MAJOR is or-ed into it and major-fault events counted.
 */
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
			     struct page **pagep, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
	struct page *page;
	swp_entry_t swap;
	int error;

	VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
	swap = radix_to_swp_entry(*pagep);
	/* Cleared up front so that every failure path leaves NULL behind */
	*pagep = NULL;

	/* Look it up and read it in.. */
	page = lookup_swap_cache(swap, NULL, 0);
	if (!page) {
		/* Or update major stats only when swapin succeeds?? */
		if (fault_type) {
			*fault_type |= VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(charge_mm, PGMAJFAULT);
		}
		/* Here we actually start the io */
		page = shmem_swapin(swap, gfp, info, index);
		if (!page) {
			error = -ENOMEM;
			goto failed;
		}
	}

	/* We have to do this with page locked to prevent races */
	lock_page(page);
	if (!PageSwapCache(page) || page_private(page) != swap.val ||
	    !shmem_confirm_swap(mapping, index, swap)) {
		error = -EEXIST;
		goto unlock;
	}
	if (!PageUptodate(page)) {
		error = -EIO;
		goto failed;
	}
	wait_on_page_writeback(page);

	/*
	 * Some architectures may have to restore extra metadata to the
	 * physical page after reading from swap.
	 */
	arch_swap_restore(swap, page);

	if (shmem_should_replace_page(page, gfp)) {
		error = shmem_replace_page(&page, gfp, info, index);
		if (error)
			goto failed;
	}

	/* Swap the value entry at @index for the page itself */
	error = shmem_add_to_page_cache(page, mapping, index,
					swp_to_radix_entry(swap), gfp,
					charge_mm);
	if (error)
		goto failed;

	spin_lock_irq(&info->lock);
	info->swapped--;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);

	if (sgp == SGP_WRITE)
		mark_page_accessed(page);

	delete_from_swap_cache(page);
	set_page_dirty(page);
	swap_free(swap);

	*pagep = page;
	return 0;
failed:
	/* If the entry changed under us, report -EEXIST rather than the error */
	if (!shmem_confirm_swap(mapping, index, swap))
		error = -EEXIST;
unlock:
	/* page is NULL here when shmem_swapin() failed */
	if (page) {
		unlock_page(page);
		put_page(page);
	}

	return error;
}
1780
/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 *
 * On success returns 0 with the page for @index stored in *pagep; for a
 * huge page that is the subpage at @index.  With SGP_READ, *pagep is NULL
 * when no uptodate page exists at @index.  When the userfaultfd-missing
 * path is taken, 0 is returned with the result of handle_userfault() in
 * *fault_type and *pagep is not written.
 *
 * Errors: -EFBIG if @index is beyond MAX_LFS_FILESIZE; -EINVAL if @index
 * lies at or beyond i_size for sgp <= SGP_CACHE; otherwise the error from
 * shmem_swapin_page(), shmem_alloc_and_acct_page() or
 * shmem_add_to_page_cache().  -EEXIST is never returned: it restarts the
 * lookup.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
	struct vm_area_struct *vma, struct vm_fault *vmf,
			vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct mm_struct *charge_mm;
	struct page *page;
	/* Remember the caller's huge-page hint before sgp is normalised */
	enum sgp_type sgp_huge = sgp;
	pgoff_t hindex = index;
	gfp_t huge_gfp;
	int error;
	int once = 0;
	int alloced = 0;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
	if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
		sgp = SGP_CACHE;
repeat:
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		return -EINVAL;
	}

	sbinfo = SHMEM_SB(inode->i_sb);
	charge_mm = vma ? vma->vm_mm : current->mm;

	page = pagecache_get_page(mapping, index,
					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
	/* A value entry here is a swap entry: let shmem_swapin_page() handle it */
	if (xa_is_value(page)) {
		error = shmem_swapin_page(inode, index, &page,
					  sgp, gfp, vma, fault_type);
		if (error == -EEXIST)
			goto repeat;

		*pagep = page;
		return error;
	}

	if (page)
		hindex = page->index;
	if (page && sgp == SGP_WRITE)
		mark_page_accessed(page);

	/* fallocated page? */
	if (page && !PageUptodate(page)) {
		if (sgp != SGP_READ)
			goto clear;
		/* SGP_READ treats a not-yet-uptodate page as absent */
		unlock_page(page);
		put_page(page);
		page = NULL;
		hindex = index;
	}
	if (page || sgp == SGP_READ)
		goto out;

	/*
	 * Fast cache lookup did not find it:
	 * bring it back from swap or allocate.
	 */

	if (vma && userfaultfd_missing(vma)) {
		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
		return 0;
	}

	/* shmem_symlink() */
	if (!shmem_mapping(mapping))
		goto alloc_nohuge;
	if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
		goto alloc_nohuge;
	if (shmem_huge == SHMEM_HUGE_FORCE)
		goto alloc_huge;
	switch (sbinfo->huge) {
	case SHMEM_HUGE_NEVER:
		goto alloc_nohuge;
	case SHMEM_HUGE_WITHIN_SIZE: {
		loff_t i_size;
		pgoff_t off;

		/* Huge only if i_size, rounded up to a page, reaches the extent's end */
		off = round_up(index, HPAGE_PMD_NR);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >= HPAGE_PMD_SIZE &&
		    i_size >> PAGE_SHIFT >= off)
			goto alloc_huge;

		fallthrough;
	}
	case SHMEM_HUGE_ADVISE:
		if (sgp_huge == SGP_HUGE)
			goto alloc_huge;
		/* TODO: implement fadvise() hints */
		goto alloc_nohuge;
	}

alloc_huge:
	huge_gfp = vma_thp_gfp_mask(vma);
	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
	page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
	/* alloc_nohuge sits inside this body: it is also the huge-failure fallback */
	if (IS_ERR(page)) {
alloc_nohuge:
		page = shmem_alloc_and_acct_page(gfp, inode,
						 index, false);
	}
	if (IS_ERR(page)) {
		int retry = 5;

		error = PTR_ERR(page);
		page = NULL;
		if (error != -ENOSPC)
			goto unlock;
		/*
		 * Try to reclaim some space by splitting a huge page
		 * beyond i_size on the filesystem.
		 */
		while (retry--) {
			int ret;

			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
			if (ret == SHRINK_STOP)
				break;
			if (ret)
				goto alloc_nohuge;
		}
		goto unlock;
	}

	if (PageTransHuge(page))
		hindex = round_down(index, HPAGE_PMD_NR);
	else
		hindex = index;

	if (sgp == SGP_WRITE)
		__SetPageReferenced(page);

	error = shmem_add_to_page_cache(page, mapping, hindex,
					NULL, gfp & GFP_RECLAIM_MASK,
					charge_mm);
	if (error)
		goto unacct;
	lru_cache_add(page);

	spin_lock_irq(&info->lock);
	info->alloced += compound_nr(page);
	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
	/* Remembered so that a late truncation check can remove the new page */
	alloced = true;

	if (PageTransHuge(page) &&
	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
			hindex + HPAGE_PMD_NR - 1) {
		/*
		 * Part of the huge page is beyond i_size: subject
		 * to shrink under memory pressure.
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		/*
		 * _careful to defend against unlocked access to
		 * ->shrink_list in shmem_unused_huge_shrink()
		 */
		if (list_empty_careful(&info->shrinklist)) {
			list_add_tail(&info->shrinklist,
				      &sbinfo->shrinklist);
			sbinfo->shrinklist_len++;
		}
		spin_unlock(&sbinfo->shrinklist_lock);
	}

	/*
	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
	 */
	if (sgp == SGP_FALLOC)
		sgp = SGP_WRITE;
clear:
	/*
	 * Let SGP_WRITE caller clear ends if write does not fill page;
	 * but SGP_FALLOC on a page fallocated earlier must initialize
	 * it now, lest undo on failure cancel our earlier guarantee.
	 */
	if (sgp != SGP_WRITE && !PageUptodate(page)) {
		int i;

		/* Zero every subpage of a (possibly compound) page */
		for (i = 0; i < compound_nr(page); i++) {
			clear_highpage(page + i);
			flush_dcache_page(page + i);
		}
		SetPageUptodate(page);
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		if (alloced) {
			ClearPageDirty(page);
			delete_from_page_cache(page);
			spin_lock_irq(&info->lock);
			shmem_recalc_inode(inode);
			spin_unlock_irq(&info->lock);
		}
		error = -EINVAL;
		goto unlock;
	}
out:
	/* Step from the head page to the subpage for @index; NULL stays NULL */
	*pagep = page + index - hindex;
	return 0;

	/*
	 * Error recovery.
	 */
unacct:
	shmem_inode_unacct_blocks(inode, compound_nr(page));

	/* A huge page that could not be added: retry with a small page */
	if (PageTransHuge(page)) {
		unlock_page(page);
		put_page(page);
		goto alloc_nohuge;
	}
unlock:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	/* On the first -ENOSPC only, recalculate the inode and try once more */
	if (error == -ENOSPC && !once++) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
		goto repeat;
	}
	if (error == -EEXIST)
		goto repeat;
	return error;
}
2027
2028/*
2029 * This is like autoremove_wake_function, but it removes the wait queue
2030 * entry unconditionally - even if something else had already woken the
2031 * target.
2032 */
2033static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2034{
2035	int ret = default_wake_function(wait, mode, sync, key);
2036	list_del_init(&wait->entry);
2037	return ret;
2038}
2039
/*
 * Fault handler for shmem mappings.
 *
 * If the faulting offset lies inside a hole that is currently being
 * punched, sleep until woken and return VM_FAULT_NOPAGE, or VM_FAULT_RETRY
 * when maybe_unlock_mmap_for_io() returned a pinned file.  Otherwise pick
 * an sgp type from the vma's huge-page flags and fetch the page with
 * shmem_getpage_gfp().
 *
 * Returns the fault code accumulated in ret (initially VM_FAULT_LOCKED), or
 * vmf_error() of the error reported by shmem_getpage_gfp().
 */
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
	enum sgp_type sgp;
	int err;
	vm_fault_t ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_mutex.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_mutex in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		/* Re-read i_private under i_lock before trusting it */
		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			struct file *fpin;
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

			ret = VM_FAULT_NOPAGE;
			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
			if (fpin)
				ret = VM_FAULT_RETRY;

			/* Copy the waitq pointer while i_lock is still held */
			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);

			if (fpin)
				fput(fpin);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	sgp = SGP_CACHE;

	/* Per-vma or per-mm huge page preferences override the default */
	if ((vma->vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		sgp = SGP_NOHUGE;
	else if (vma->vm_flags & VM_HUGEPAGE)
		sgp = SGP_HUGE;

	err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
				  gfp, vma, vmf, &ret);
	if (err)
		return vmf_error(err);
	return ret;
}
2122
2123unsigned long shmem_get_unmapped_area(struct file *file,
2124				      unsigned long uaddr, unsigned long len,
2125				      unsigned long pgoff, unsigned long flags)
2126{
2127	unsigned long (*get_area)(struct file *,
2128		unsigned long, unsigned long, unsigned long, unsigned long);
2129	unsigned long addr;
2130	unsigned long offset;
2131	unsigned long inflated_len;
2132	unsigned long inflated_addr;
2133	unsigned long inflated_offset;
2134
2135	if (len > TASK_SIZE)
2136		return -ENOMEM;
2137
2138	get_area = current->mm->get_unmapped_area;
2139	addr = get_area(file, uaddr, len, pgoff, flags);
2140
2141	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2142		return addr;
2143	if (IS_ERR_VALUE(addr))
2144		return addr;
2145	if (addr & ~PAGE_MASK)
2146		return addr;
2147	if (addr > TASK_SIZE - len)
2148		return addr;
2149
2150	if (shmem_huge == SHMEM_HUGE_DENY)
2151		return addr;
2152	if (len < HPAGE_PMD_SIZE)
2153		return addr;
2154	if (flags & MAP_FIXED)
2155		return addr;
2156	/*
2157	 * Our priority is to support MAP_SHARED mapped hugely;
2158	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2159	 * But if caller specified an address hint and we allocated area there
2160	 * successfully, respect that as before.
2161	 */
2162	if (uaddr == addr)
2163		return addr;
2164
2165	if (shmem_huge != SHMEM_HUGE_FORCE) {
2166		struct super_block *sb;
2167
2168		if (file) {
2169			VM_BUG_ON(file->f_op != &shmem_file_operations);
2170			sb = file_inode(file)->i_sb;
2171		} else {
2172			/*
2173			 * Called directly from mm/mmap.c, or drivers/char/mem.c
2174			 * for "/dev/zero", to create a shared anonymous object.
2175			 */
2176			if (IS_ERR(shm_mnt))
2177				return addr;
2178			sb = shm_mnt->mnt_sb;
2179		}
2180		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2181			return addr;
2182	}
2183
2184	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2185	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2186		return addr;
2187	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2188		return addr;
2189
2190	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2191	if (inflated_len > TASK_SIZE)
2192		return addr;
2193	if (inflated_len < len)
2194		return addr;
2195
2196	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2197	if (IS_ERR_VALUE(inflated_addr))
2198		return addr;
2199	if (inflated_addr & ~PAGE_MASK)
2200		return addr;
2201
2202	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2203	inflated_addr += offset - inflated_offset;
2204	if (inflated_offset > offset)
2205		inflated_addr += HPAGE_PMD_SIZE;
2206
2207	if (inflated_addr > TASK_SIZE - len)
2208		return addr;
2209	return inflated_addr;
2210}
2211
2212#ifdef CONFIG_NUMA
2213static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2214{
2215	struct inode *inode = file_inode(vma->vm_file);
2216	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2217}
2218
2219static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2220					  unsigned long addr)
2221{
2222	struct inode *inode = file_inode(vma->vm_file);
2223	pgoff_t index;
2224
2225	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2226	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2227}
2228#endif
2229
2230int shmem_lock(struct file *file, int lock, struct user_struct *user)
2231{
2232	struct inode *inode = file_inode(file);
2233	struct shmem_inode_info *info = SHMEM_I(inode);
2234	int retval = -ENOMEM;
2235
2236	/*
2237	 * What serializes the accesses to info->flags?
2238	 * ipc_lock_object() when called from shmctl_do_lock(),
2239	 * no serialization needed when called from shm_destroy().
2240	 */
2241	if (lock && !(info->flags & VM_LOCKED)) {
2242		if (!user_shm_lock(inode->i_size, user))
2243			goto out_nomem;
2244		info->flags |= VM_LOCKED;
2245		mapping_set_unevictable(file->f_mapping);
2246	}
2247	if (!lock && (info->flags & VM_LOCKED) && user) {
2248		user_shm_unlock(inode->i_size, user);
2249		info->flags &= ~VM_LOCKED;
2250		mapping_clear_unevictable(file->f_mapping);
2251	}
2252	retval = 0;
2253
2254out_nomem:
2255	return retval;
2256}
2257
2258static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2259{
2260	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2261
2262	if (info->seals & F_SEAL_FUTURE_WRITE) {
2263		/*
2264		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2265		 * "future write" seal active.
2266		 */
2267		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2268			return -EPERM;
2269
2270		/*
2271		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
2272		 * MAP_SHARED and read-only, take care to not allow mprotect to
2273		 * revert protections on such mappings. Do this only for shared
2274		 * mappings. For private mappings, don't need to mask
2275		 * VM_MAYWRITE as we still want them to be COW-writable.
2276		 */
2277		if (vma->vm_flags & VM_SHARED)
2278			vma->vm_flags &= ~(VM_MAYWRITE);
2279	}
2280
2281	/* arm64 - allow memory tagging on RAM-based files */
2282	vma->vm_flags |= VM_MTE_ALLOWED;
2283
2284	file_accessed(file);
2285	vma->vm_ops = &shmem_vm_ops;
2286	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2287			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2288			(vma->vm_end & HPAGE_PMD_MASK)) {
2289		khugepaged_enter(vma, vma->vm_flags);
2290	}
2291	return 0;
2292}
2293
2294static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
2295				     umode_t mode, dev_t dev, unsigned long flags)
2296{
2297	struct inode *inode;
2298	struct shmem_inode_info *info;
2299	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2300	ino_t ino;
2301
2302	if (shmem_reserve_inode(sb, &ino))
2303		return NULL;
2304
2305	inode = new_inode(sb);
2306	if (inode) {
2307		inode->i_ino = ino;
2308		inode_init_owner(&init_user_ns, inode, dir, mode);
2309		inode->i_blocks = 0;
2310		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2311		inode->i_generation = prandom_u32();
2312		info = SHMEM_I(inode);
2313		memset(info, 0, (char *)inode - (char *)info);
2314		spin_lock_init(&info->lock);
2315		atomic_set(&info->stop_eviction, 0);
2316		info->seals = F_SEAL_SEAL;
2317		info->flags = flags & VM_NORESERVE;
2318		INIT_LIST_HEAD(&info->shrinklist);
2319		INIT_LIST_HEAD(&info->swaplist);
2320		simple_xattrs_init(&info->xattrs);
2321		cache_no_acl(inode);
2322
2323		switch (mode & S_IFMT) {
2324		default:
2325			inode->i_op = &shmem_special_inode_operations;
2326			init_special_inode(inode, mode, dev);
2327			break;
2328		case S_IFREG:
2329			inode->i_mapping->a_ops = &shmem_aops;
2330			inode->i_op = &shmem_inode_operations;
2331			inode->i_fop = &shmem_file_operations;
2332			mpol_shared_policy_init(&info->policy,
2333						 shmem_get_sbmpol(sbinfo));
2334			break;
2335		case S_IFDIR:
2336			inc_nlink(inode);
2337			/* Some things misbehave if size == 0 on a directory */
2338			inode->i_size = 2 * BOGO_DIRENT_SIZE;
2339			inode->i_op = &shmem_dir_inode_operations;
2340			inode->i_fop = &simple_dir_operations;
2341			break;
2342		case S_IFLNK:
2343			/*
2344			 * Must not load anything in the rbtree,
2345			 * mpol_free_shared_policy will not be called.
2346			 */
2347			mpol_shared_policy_init(&info->policy, NULL);
2348			break;
2349		}
2350
2351		lockdep_annotate_inode_mutex_key(inode);
2352	} else
2353		shmem_free_inode(sb);
2354	return inode;
2355}
2356
2357static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2358				  pmd_t *dst_pmd,
2359				  struct vm_area_struct *dst_vma,
2360				  unsigned long dst_addr,
2361				  unsigned long src_addr,
2362				  bool zeropage,
2363				  struct page **pagep)
2364{
2365	struct inode *inode = file_inode(dst_vma->vm_file);
2366	struct shmem_inode_info *info = SHMEM_I(inode);
2367	struct address_space *mapping = inode->i_mapping;
2368	gfp_t gfp = mapping_gfp_mask(mapping);
2369	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2370	spinlock_t *ptl;
2371	void *page_kaddr;
2372	struct page *page;
2373	pte_t _dst_pte, *dst_pte;
2374	int ret;
2375	pgoff_t offset, max_off;
2376
2377	ret = -ENOMEM;
2378	if (!shmem_inode_acct_block(inode, 1))
2379		goto out;
2380
2381	if (!*pagep) {
2382		page = shmem_alloc_page(gfp, info, pgoff);
2383		if (!page)
2384			goto out_unacct_blocks;
2385
2386		if (!zeropage) {	/* mcopy_atomic */
2387			page_kaddr = kmap_atomic(page);
2388			ret = copy_from_user(page_kaddr,
2389					     (const void __user *)src_addr,
2390					     PAGE_SIZE);
2391			kunmap_atomic(page_kaddr);
2392
2393			/* fallback to copy_from_user outside mmap_lock */
2394			if (unlikely(ret)) {
2395				*pagep = page;
2396				shmem_inode_unacct_blocks(inode, 1);
2397				/* don't free the page */
2398				return -ENOENT;
2399			}
2400		} else {		/* mfill_zeropage_atomic */
2401			clear_highpage(page);
2402		}
2403	} else {
2404		page = *pagep;
2405		*pagep = NULL;
2406	}
2407
2408	VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2409	__SetPageLocked(page);
2410	__SetPageSwapBacked(page);
2411	__SetPageUptodate(page);
2412
2413	ret = -EFAULT;
2414	offset = linear_page_index(dst_vma, dst_addr);
2415	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2416	if (unlikely(offset >= max_off))
2417		goto out_release;
2418
2419	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2420				      gfp & GFP_RECLAIM_MASK, dst_mm);
2421	if (ret)
2422		goto out_release;
2423
2424	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2425	if (dst_vma->vm_flags & VM_WRITE)
2426		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2427	else {
2428		/*
2429		 * We don't set the pte dirty if the vma has no
2430		 * VM_WRITE permission, so mark the page dirty or it
2431		 * could be freed from under us. We could do it
2432		 * unconditionally before unlock_page(), but doing it
2433		 * only if VM_WRITE is not set is faster.
2434		 */
2435		set_page_dirty(page);
2436	}
2437
2438	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2439
2440	ret = -EFAULT;
2441	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2442	if (unlikely(offset >= max_off))
2443		goto out_release_unlock;
2444
2445	ret = -EEXIST;
2446	if (!pte_none(*dst_pte))
2447		goto out_release_unlock;
2448
2449	lru_cache_add(page);
2450
2451	spin_lock_irq(&info->lock);
2452	info->alloced++;
2453	inode->i_blocks += BLOCKS_PER_PAGE;
2454	shmem_recalc_inode(inode);
2455	spin_unlock_irq(&info->lock);
2456
2457	inc_mm_counter(dst_mm, mm_counter_file(page));
2458	page_add_file_rmap(page, false);
2459	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2460
2461	/* No need to invalidate - it was non-present before */
2462	update_mmu_cache(dst_vma, dst_addr, dst_pte);
2463	pte_unmap_unlock(dst_pte, ptl);
2464	unlock_page(page);
2465	ret = 0;
2466out:
2467	return ret;
2468out_release_unlock:
2469	pte_unmap_unlock(dst_pte, ptl);
2470	ClearPageDirty(page);
2471	delete_from_page_cache(page);
2472out_release:
2473	unlock_page(page);
2474	put_page(page);
2475out_unacct_blocks:
2476	shmem_inode_unacct_blocks(inode, 1);
2477	goto out;
2478}
2479
2480int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2481			   pmd_t *dst_pmd,
2482			   struct vm_area_struct *dst_vma,
2483			   unsigned long dst_addr,
2484			   unsigned long src_addr,
2485			   struct page **pagep)
2486{
2487	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2488				      dst_addr, src_addr, false, pagep);
2489}
2490
2491int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2492			     pmd_t *dst_pmd,
2493			     struct vm_area_struct *dst_vma,
2494			     unsigned long dst_addr)
2495{
2496	struct page *page = NULL;
2497
2498	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2499				      dst_addr, 0, true, &page);
2500}
2501
2502#ifdef CONFIG_TMPFS
/*
 * Forward declarations of the two symlink operation tables, so that
 * shmem_symlink() below can refer to them (presumably they are filled in
 * further down the file - not visible from here).
 */
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

/*
 * shmem_initxattrs is passed as the initxattrs callback to
 * security_inode_init_security(); without CONFIG_TMPFS_XATTR the name
 * expands to NULL so those call sites need no #ifdef of their own.
 */
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
#else
#define shmem_initxattrs NULL
#endif
2511
2512static int
2513shmem_write_begin(struct file *file, struct address_space *mapping,
2514			loff_t pos, unsigned len, unsigned flags,
2515			struct page **pagep, void **fsdata)
2516{
2517	struct inode *inode = mapping->host;
2518	struct shmem_inode_info *info = SHMEM_I(inode);
2519	pgoff_t index = pos >> PAGE_SHIFT;
2520
2521	/* i_mutex is held by caller */
2522	if (unlikely(info->seals & (F_SEAL_GROW |
2523				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2524		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2525			return -EPERM;
2526		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2527			return -EPERM;
2528	}
2529
2530	return shmem_getpage(inode, index, pagep, SGP_WRITE);
2531}
2532
2533static int
2534shmem_write_end(struct file *file, struct address_space *mapping,
2535			loff_t pos, unsigned len, unsigned copied,
2536			struct page *page, void *fsdata)
2537{
2538	struct inode *inode = mapping->host;
2539
2540	if (pos + copied > inode->i_size)
2541		i_size_write(inode, pos + copied);
2542
2543	if (!PageUptodate(page)) {
2544		struct page *head = compound_head(page);
2545		if (PageTransCompound(page)) {
2546			int i;
2547
2548			for (i = 0; i < HPAGE_PMD_NR; i++) {
2549				if (head + i == page)
2550					continue;
2551				clear_highpage(head + i);
2552				flush_dcache_page(head + i);
2553			}
2554		}
2555		if (copied < PAGE_SIZE) {
2556			unsigned from = pos & (PAGE_SIZE - 1);
2557			zero_user_segments(page, 0, from,
2558					from + copied, PAGE_SIZE);
2559		}
2560		SetPageUptodate(head);
2561	}
2562	set_page_dirty(page);
2563	unlock_page(page);
2564	put_page(page);
2565
2566	return copied;
2567}
2568
/*
 * shmem_file_read_iter - read from a shmem file into an iov_iter.
 *
 * Starting at iocb->ki_pos, pages are looked up one at a time with
 * shmem_getpage() and copied into @to.  For iovec iterators SGP_READ is
 * used, and an index for which no page comes back is served from
 * ZERO_PAGE(0); for any other iterator SGP_CACHE is used and each page
 * obtained is marked dirty.  i_size is sampled before and again after the
 * page lookup, so that the copy never extends past the current end of file.
 *
 * ki_pos is advanced past the bytes copied.  Returns the number of bytes
 * copied; if nothing was copied, returns the pending error instead (0 at
 * end of file, -EFAULT on a short copy, or the shmem_getpage() error, with
 * -EINVAL from shmem_getpage() reported as 0).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	enum sgp_type sgp = SGP_READ;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (!iter_is_iovec(to))
		sgp = SGP_CACHE;

	/* Split the file position into a page index and an in-page offset. */
	index = *ppos >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		/* Stop before the lookup if we are already at or past EOF. */
		end_index = i_size >> PAGE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, sgp);
		if (error) {
			/* -EINVAL is not reported to the caller as an error. */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page) {
			if (sgp == SGP_CACHE)
				set_page_dirty(page);
			unlock_page(page);
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_mutex protection against truncate
		 */
		nr = PAGE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset) {
				if (page)
					put_page(page);
				break;
			}
		}
		/* nr is now the number of bytes available from offset. */
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			/* No page was returned: read zeroes from the zero page. */
			page = ZERO_PAGE(0);
			get_page(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, to);
		retval += ret;
		/* Advance, carrying a completed page over into index. */
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;

		put_page(page);
		if (!iov_iter_count(to))
			break;
		/* A short copy with room left in the iterator means a fault. */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}
2678
2679static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2680{
2681	struct address_space *mapping = file->f_mapping;
2682	struct inode *inode = mapping->host;
2683
2684	if (whence != SEEK_DATA && whence != SEEK_HOLE)
2685		return generic_file_llseek_size(file, offset, whence,
2686					MAX_LFS_FILESIZE, i_size_read(inode));
2687	if (offset < 0)
2688		return -ENXIO;
2689
2690	inode_lock(inode);
2691	/* We're holding i_mutex so we can access i_size directly */
2692	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
2693	if (offset >= 0)
2694		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2695	inode_unlock(inode);
2696	return offset;
2697}
2698
/*
 * shmem_fallocate - fallocate(2) for shmem files.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit yields -EOPNOTSUPP.  The inode lock is held for the whole call.
 *
 * Hole punch: refused with -EPERM under F_SEAL_WRITE/F_SEAL_FUTURE_WRITE.
 * Otherwise a struct shmem_falloc carrying an on-stack wait queue is
 * published in inode->i_private (which is what shmem_fault() checks), the
 * whole pages inside the range are unmapped, the byte range is truncated,
 * and finally i_private is cleared and all waiters are woken.
 *
 * Preallocation: after the size, F_SEAL_GROW and max_blocks checks, each
 * page in the range is instantiated with shmem_getpage(SGP_FALLOC).  A
 * struct shmem_falloc without a wait queue is published in i_private for
 * the duration, tracking progress.  On error the pages added so far are
 * removed with shmem_undo_range().  On success i_size is extended unless
 * FALLOC_FL_KEEP_SIZE was given, and ctime is updated.
 *
 * Returns 0 or a negative errno (-EOPNOTSUPP, -EPERM, -ENOSPC, -EINTR,
 * -ENOMEM, or an error from inode_newsize_ok()/shmem_getpage()).
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Byte range of the whole pages lying inside the hole. */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_mutex */
		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
			error = -EPERM;
			goto out;
		}

		/* Publish the range being punched, as page indices. */
		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		/* Unmap only if the range holds at least one whole page. */
		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/*
		 * Unpublish and wake the waiters under i_lock: the wait
		 * queue lives on this stack frame and must be empty before
		 * we return.
		 */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/* Page index range [start, end) covering the requested bytes. */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	/* A NULL waitq distinguishes preallocation from hole punching. */
	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	for (index = start; index < end; index++) {
		struct page *page;

		/*
		 * Good, the fallocate(2) manpage permits EINTR: we may have
		 * been interrupted because we are using up too much memory.
		 */
		if (signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
		if (error) {
			/* Remove the !PageUptodate pages we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		/*
		 * Inform shmem_writepage() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		shmem_falloc.next++;
		if (!PageUptodate(page))
			shmem_falloc.nr_falloced++;

		/*
		 * If !PageUptodate, leave it that way so that freeable pages
		 * can be recognized if we need to rollback on error later.
		 * But set_page_dirty so that memory pressure will swap rather
		 * than free the pages we are allocating (and SGP_CACHE pages
		 * might still be clean: we now need to mark those dirty too).
		 */
		set_page_dirty(page);
		unlock_page(page);
		put_page(page);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = current_time(inode);
undone:
	/* Reached on success and on failure alike: unpublish shmem_falloc. */
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	inode_unlock(inode);
	return error;
}
2830
2831static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2832{
2833	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2834
2835	buf->f_type = TMPFS_MAGIC;
2836	buf->f_bsize = PAGE_SIZE;
2837	buf->f_namelen = NAME_MAX;
2838	if (sbinfo->max_blocks) {
2839		buf->f_blocks = sbinfo->max_blocks;
2840		buf->f_bavail =
2841		buf->f_bfree  = sbinfo->max_blocks -
2842				percpu_counter_sum(&sbinfo->used_blocks);
2843	}
2844	if (sbinfo->max_inodes) {
2845		buf->f_files = sbinfo->max_inodes;
2846		buf->f_ffree = sbinfo->free_inodes;
2847	}
2848	/* else leave those fields 0 like simple_statfs */
2849	return 0;
2850}
2851
2852/*
2853 * File creation. Allocate an inode, and we're done..
2854 */
2855static int
2856shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
2857	    struct dentry *dentry, umode_t mode, dev_t dev)
2858{
2859	struct inode *inode;
2860	int error = -ENOSPC;
2861
2862	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2863	if (inode) {
2864		error = simple_acl_create(dir, inode);
2865		if (error)
2866			goto out_iput;
2867		error = security_inode_init_security(inode, dir,
2868						     &dentry->d_name,
2869						     shmem_initxattrs, NULL);
2870		if (error && error != -EOPNOTSUPP)
2871			goto out_iput;
2872
2873		error = 0;
2874		dir->i_size += BOGO_DIRENT_SIZE;
2875		dir->i_ctime = dir->i_mtime = current_time(dir);
2876		d_instantiate(dentry, inode);
2877		dget(dentry); /* Extra count - pin the dentry in core */
2878	}
2879	return error;
2880out_iput:
2881	iput(inode);
2882	return error;
2883}
2884
2885static int
2886shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
2887	      struct dentry *dentry, umode_t mode)
2888{
2889	struct inode *inode;
2890	int error = -ENOSPC;
2891
2892	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2893	if (inode) {
2894		error = security_inode_init_security(inode, dir,
2895						     NULL,
2896						     shmem_initxattrs, NULL);
2897		if (error && error != -EOPNOTSUPP)
2898			goto out_iput;
2899		error = simple_acl_create(dir, inode);
2900		if (error)
2901			goto out_iput;
2902		d_tmpfile(dentry, inode);
2903	}
2904	return error;
2905out_iput:
2906	iput(inode);
2907	return error;
2908}
2909
2910static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
2911		       struct dentry *dentry, umode_t mode)
2912{
2913	int error;
2914
2915	if ((error = shmem_mknod(&init_user_ns, dir, dentry,
2916				 mode | S_IFDIR, 0)))
2917		return error;
2918	inc_nlink(dir);
2919	return 0;
2920}
2921
2922static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
2923			struct dentry *dentry, umode_t mode, bool excl)
2924{
2925	return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
2926}
2927
2928/*
2929 * Link a file..
2930 */
2931static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2932{
2933	struct inode *inode = d_inode(old_dentry);
2934	int ret = 0;
2935
2936	/*
2937	 * No ordinary (disk based) filesystem counts links as inodes;
2938	 * but each new link needs a new dentry, pinning lowmem, and
2939	 * tmpfs dentries cannot be pruned until they are unlinked.
2940	 * But if an O_TMPFILE file is linked into the tmpfs, the
2941	 * first link must skip that, to get the accounting right.
2942	 */
2943	if (inode->i_nlink) {
2944		ret = shmem_reserve_inode(inode->i_sb, NULL);
2945		if (ret)
2946			goto out;
2947	}
2948
2949	dir->i_size += BOGO_DIRENT_SIZE;
2950	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2951	inc_nlink(inode);
2952	ihold(inode);	/* New dentry reference */
2953	dget(dentry);		/* Extra pinning count for the created dentry */
2954	d_instantiate(dentry, inode);
2955out:
2956	return ret;
2957}
2958
2959static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2960{
2961	struct inode *inode = d_inode(dentry);
2962
2963	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2964		shmem_free_inode(inode->i_sb);
2965
2966	dir->i_size -= BOGO_DIRENT_SIZE;
2967	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2968	drop_nlink(inode);
2969	dput(dentry);	/* Undo the count from "create" - this does all the work */
2970	return 0;
2971}
2972
2973static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2974{
2975	if (!simple_empty(dentry))
2976		return -ENOTEMPTY;
2977
2978	drop_nlink(d_inode(dentry));
2979	drop_nlink(dir);
2980	return shmem_unlink(dir, dentry);
2981}
2982
2983static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2984{
2985	bool old_is_dir = d_is_dir(old_dentry);
2986	bool new_is_dir = d_is_dir(new_dentry);
2987
2988	if (old_dir != new_dir && old_is_dir != new_is_dir) {
2989		if (old_is_dir) {
2990			drop_nlink(old_dir);
2991			inc_nlink(new_dir);
2992		} else {
2993			drop_nlink(new_dir);
2994			inc_nlink(old_dir);
2995		}
2996	}
2997	old_dir->i_ctime = old_dir->i_mtime =
2998	new_dir->i_ctime = new_dir->i_mtime =
2999	d_inode(old_dentry)->i_ctime =
3000	d_inode(new_dentry)->i_ctime = current_time(old_dir);
3001
3002	return 0;
3003}
3004
3005static int shmem_whiteout(struct user_namespace *mnt_userns,
3006			  struct inode *old_dir, struct dentry *old_dentry)
3007{
3008	struct dentry *whiteout;
3009	int error;
3010
3011	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3012	if (!whiteout)
3013		return -ENOMEM;
3014
3015	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
3016			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3017	dput(whiteout);
3018	if (error)
3019		return error;
3020
3021	/*
3022	 * Cheat and hash the whiteout while the old dentry is still in
3023	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3024	 *
3025	 * d_lookup() will consistently find one of them at this point,
3026	 * not sure which one, but that isn't even important.
3027	 */
3028	d_rehash(whiteout);
3029	return 0;
3030}
3031
3032/*
3033 * The VFS layer already does all the dentry stuff for rename,
3034 * we just have to decrement the usage count for the target if
3035 * it exists so that the VFS layer correctly free's it when it
3036 * gets overwritten.
3037 */
3038static int shmem_rename2(struct user_namespace *mnt_userns,
3039			 struct inode *old_dir, struct dentry *old_dentry,
3040			 struct inode *new_dir, struct dentry *new_dentry,
3041			 unsigned int flags)
3042{
3043	struct inode *inode = d_inode(old_dentry);
3044	int they_are_dirs = S_ISDIR(inode->i_mode);
3045
3046	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3047		return -EINVAL;
3048
3049	if (flags & RENAME_EXCHANGE)
3050		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
3051
3052	if (!simple_empty(new_dentry))
3053		return -ENOTEMPTY;
3054
3055	if (flags & RENAME_WHITEOUT) {
3056		int error;
3057
3058		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
3059		if (error)
3060			return error;
3061	}
3062
3063	if (d_really_is_positive(new_dentry)) {
3064		(void) shmem_unlink(new_dir, new_dentry);
3065		if (they_are_dirs) {
3066			drop_nlink(d_inode(new_dentry));
3067			drop_nlink(old_dir);
3068		}
3069	} else if (they_are_dirs) {
3070		drop_nlink(old_dir);
3071		inc_nlink(new_dir);
3072	}
3073
3074	old_dir->i_size -= BOGO_DIRENT_SIZE;
3075	new_dir->i_size += BOGO_DIRENT_SIZE;
3076	old_dir->i_ctime = old_dir->i_mtime =
3077	new_dir->i_ctime = new_dir->i_mtime =
3078	inode->i_ctime = current_time(old_dir);
3079	return 0;
3080}
3081
3082static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
3083			 struct dentry *dentry, const char *symname)
3084{
3085	int error;
3086	int len;
3087	struct inode *inode;
3088	struct page *page;
3089
3090	len = strlen(symname) + 1;
3091	if (len > PAGE_SIZE)
3092		return -ENAMETOOLONG;
3093
3094	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
3095				VM_NORESERVE);
3096	if (!inode)
3097		return -ENOSPC;
3098
3099	error = security_inode_init_security(inode, dir, &dentry->d_name,
3100					     shmem_initxattrs, NULL);
3101	if (error && error != -EOPNOTSUPP) {
3102		iput(inode);
3103		return error;
3104	}
3105
3106	inode->i_size = len-1;
3107	if (len <= SHORT_SYMLINK_LEN) {
3108		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3109		if (!inode->i_link) {
3110			iput(inode);
3111			return -ENOMEM;
3112		}
3113		inode->i_op = &shmem_short_symlink_operations;
3114	} else {
3115		inode_nohighmem(inode);
3116		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
3117		if (error) {
3118			iput(inode);
3119			return error;
3120		}
3121		inode->i_mapping->a_ops = &shmem_aops;
3122		inode->i_op = &shmem_symlink_inode_operations;
3123		memcpy(page_address(page), symname, len);
3124		SetPageUptodate(page);
3125		set_page_dirty(page);
3126		unlock_page(page);
3127		put_page(page);
3128	}
3129	dir->i_size += BOGO_DIRENT_SIZE;
3130	dir->i_ctime = dir->i_mtime = current_time(dir);
3131	d_instantiate(dentry, inode);
3132	dget(dentry);
3133	return 0;
3134}
3135
/*
 * Delayed-call callback set up by shmem_get_link(): mark the symlink's page
 * accessed and drop the reference taken on it.
 */
static void shmem_put_link(void *arg)
{
	struct page *page = arg;

	mark_page_accessed(page);
	put_page(page);
}
3141
3142static const char *shmem_get_link(struct dentry *dentry,
3143				  struct inode *inode,
3144				  struct delayed_call *done)
3145{
3146	struct page *page = NULL;
3147	int error;
3148	if (!dentry) {
3149		page = find_get_page(inode->i_mapping, 0);
3150		if (!page)
3151			return ERR_PTR(-ECHILD);
3152		if (!PageUptodate(page)) {
3153			put_page(page);
3154			return ERR_PTR(-ECHILD);
3155		}
3156	} else {
3157		error = shmem_getpage(inode, 0, &page, SGP_READ);
3158		if (error)
3159			return ERR_PTR(error);
3160		unlock_page(page);
3161	}
3162	set_delayed_call(done, shmem_put_link, page);
3163	return page_address(page);
3164}
3165
3166#ifdef CONFIG_TMPFS_XATTR
3167/*
3168 * Superblocks without xattr inode operations may get some security.* xattr
3169 * support from the LSM "for free". As soon as we have any other xattrs
3170 * like ACLs, we also need to implement the security.* handlers at
3171 * filesystem level, though.
3172 */
3173
3174/*
3175 * Callback for security_inode_init_security() for acquiring xattrs.
3176 */
3177static int shmem_initxattrs(struct inode *inode,
3178			    const struct xattr *xattr_array,
3179			    void *fs_info)
3180{
3181	struct shmem_inode_info *info = SHMEM_I(inode);
3182	const struct xattr *xattr;
3183	struct simple_xattr *new_xattr;
3184	size_t len;
3185
3186	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3187		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3188		if (!new_xattr)
3189			return -ENOMEM;
3190
3191		len = strlen(xattr->name) + 1;
3192		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3193					  GFP_KERNEL);
3194		if (!new_xattr->name) {
3195			kvfree(new_xattr);
3196			return -ENOMEM;
3197		}
3198
3199		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3200		       XATTR_SECURITY_PREFIX_LEN);
3201		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3202		       xattr->name, len);
3203
3204		simple_xattr_list_add(&info->xattrs, new_xattr);
3205	}
3206
3207	return 0;
3208}
3209
3210static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3211				   struct dentry *unused, struct inode *inode,
3212				   const char *name, void *buffer, size_t size)
3213{
3214	struct shmem_inode_info *info = SHMEM_I(inode);
3215
3216	name = xattr_full_name(handler, name);
3217	return simple_xattr_get(&info->xattrs, name, buffer, size);
3218}
3219
3220static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3221				   struct user_namespace *mnt_userns,
3222				   struct dentry *unused, struct inode *inode,
3223				   const char *name, const void *value,
3224				   size_t size, int flags)
3225{
3226	struct shmem_inode_info *info = SHMEM_I(inode);
3227
3228	name = xattr_full_name(handler, name);
3229	return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
3230}
3231
/* Handler for attributes under XATTR_SECURITY_PREFIX, using the shared get/set */
static const struct xattr_handler shmem_security_xattr_handler = {
	.prefix = XATTR_SECURITY_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
3237
/* Handler for attributes under XATTR_TRUSTED_PREFIX, using the shared get/set */
static const struct xattr_handler shmem_trusted_xattr_handler = {
	.prefix = XATTR_TRUSTED_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
3243
/*
 * NULL-terminated handler list installed as sb->s_xattr by
 * shmem_fill_super(); the POSIX ACL handlers are only present when
 * CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
#endif
	&shmem_security_xattr_handler,
	&shmem_trusted_xattr_handler,
	NULL
};
3253
3254static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3255{
3256	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3257	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3258}
3259#endif /* CONFIG_TMPFS_XATTR */
3260
/*
 * Inode operations for short symlinks, whose target shmem_symlink()
 * stores in inode->i_link.
 */
static const struct inode_operations shmem_short_symlink_operations = {
	.get_link	= simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
3267
/*
 * Inode operations for long symlinks, whose target shmem_symlink()
 * stores in page 0 of the inode's mapping.
 */
static const struct inode_operations shmem_symlink_inode_operations = {
	.get_link	= shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
3274
/* Export ->get_parent: parent lookup is not supported, always -ESTALE */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	return ERR_PTR(-ESTALE);
}
3279
3280static int shmem_match(struct inode *ino, void *vfh)
3281{
3282	__u32 *fh = vfh;
3283	__u64 inum = fh[2];
3284	inum = (inum << 32) | fh[1];
3285	return ino->i_ino == inum && fh[0] == ino->i_generation;
3286}
3287
/* Return an alias of @inode: whatever d_find_alias() yields, else any alias */
static struct dentry *shmem_find_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);

	if (!alias)
		alias = d_find_any_alias(inode);
	return alias;
}
3295
3296
3297static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3298		struct fid *fid, int fh_len, int fh_type)
3299{
3300	struct inode *inode;
3301	struct dentry *dentry = NULL;
3302	u64 inum;
3303
3304	if (fh_len < 3)
3305		return NULL;
3306
3307	inum = fid->raw[2];
3308	inum = (inum << 32) | fid->raw[1];
3309
3310	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3311			shmem_match, fid->raw);
3312	if (inode) {
3313		dentry = shmem_find_alias(inode);
3314		iput(inode);
3315	}
3316
3317	return dentry;
3318}
3319
3320static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3321				struct inode *parent)
3322{
3323	if (*len < 3) {
3324		*len = 3;
3325		return FILEID_INVALID;
3326	}
3327
3328	if (inode_unhashed(inode)) {
3329		/* Unfortunately insert_inode_hash is not idempotent,
3330		 * so as we hash inodes here rather than at creation
3331		 * time, we need a lock to ensure we only try
3332		 * to do it once
3333		 */
3334		static DEFINE_SPINLOCK(lock);
3335		spin_lock(&lock);
3336		if (inode_unhashed(inode))
3337			__insert_inode_hash(inode,
3338					    inode->i_ino + inode->i_generation);
3339		spin_unlock(&lock);
3340	}
3341
3342	fh[0] = inode->i_generation;
3343	fh[1] = inode->i_ino;
3344	fh[2] = ((__u64)inode->i_ino) >> 32;
3345
3346	*len = 3;
3347	return 1;
3348}
3349
/* File-handle export operations, installed as sb->s_export_op by shmem_fill_super() */
static const struct export_operations shmem_export_ops = {
	.get_parent     = shmem_get_parent,
	.encode_fh      = shmem_encode_fh,
	.fh_to_dentry	= shmem_fh_to_dentry,
};
3355
/* Option identifiers returned by fs_parse() for shmem_fs_parameters[] */
enum shmem_param {
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	Opt_inode32,
	Opt_inode64,
};
3368
/* Accepted values of the "huge=" mount option and their SHMEM_HUGE_* codes */
static const struct constant_table shmem_param_enums_huge[] = {
	{"never",	SHMEM_HUGE_NEVER },
	{"always",	SHMEM_HUGE_ALWAYS },
	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
	{"advise",	SHMEM_HUGE_ADVISE },
	{}
};
3376
/*
 * Mount parameter table consumed by fs_parse() in shmem_parse_one().
 * size, nr_blocks and nr_inodes are declared as strings because
 * shmem_parse_one() parses them itself with memparse().
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
	fsparam_u32   ("gid",		Opt_gid),
	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("mpol",		Opt_mpol),
	fsparam_string("nr_blocks",	Opt_nr_blocks),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("size",		Opt_size),
	fsparam_u32   ("uid",		Opt_uid),
	fsparam_flag  ("inode32",	Opt_inode32),
	fsparam_flag  ("inode64",	Opt_inode64),
	{}
};
3390
3391static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3392{
3393	struct shmem_options *ctx = fc->fs_private;
3394	struct fs_parse_result result;
3395	unsigned long long size;
3396	char *rest;
3397	int opt;
3398
3399	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3400	if (opt < 0)
3401		return opt;
3402
3403	switch (opt) {
3404	case Opt_size:
3405		size = memparse(param->string, &rest);
3406		if (*rest == '%') {
3407			size <<= PAGE_SHIFT;
3408			size *= totalram_pages();
3409			do_div(size, 100);
3410			rest++;
3411		}
3412		if (*rest)
3413			goto bad_value;
3414		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3415		ctx->seen |= SHMEM_SEEN_BLOCKS;
3416		break;
3417	case Opt_nr_blocks:
3418		ctx->blocks = memparse(param->string, &rest);
3419		if (*rest)
3420			goto bad_value;
3421		ctx->seen |= SHMEM_SEEN_BLOCKS;
3422		break;
3423	case Opt_nr_inodes:
3424		ctx->inodes = memparse(param->string, &rest);
3425		if (*rest)
3426			goto bad_value;
3427		ctx->seen |= SHMEM_SEEN_INODES;
3428		break;
3429	case Opt_mode:
3430		ctx->mode = result.uint_32 & 07777;
3431		break;
3432	case Opt_uid:
3433		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3434		if (!uid_valid(ctx->uid))
3435			goto bad_value;
3436		break;
3437	case Opt_gid:
3438		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3439		if (!gid_valid(ctx->gid))
3440			goto bad_value;
3441		break;
3442	case Opt_huge:
3443		ctx->huge = result.uint_32;
3444		if (ctx->huge != SHMEM_HUGE_NEVER &&
3445		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3446		      has_transparent_hugepage()))
3447			goto unsupported_parameter;
3448		ctx->seen |= SHMEM_SEEN_HUGE;
3449		break;
3450	case Opt_mpol:
3451		if (IS_ENABLED(CONFIG_NUMA)) {
3452			mpol_put(ctx->mpol);
3453			ctx->mpol = NULL;
3454			if (mpol_parse_str(param->string, &ctx->mpol))
3455				goto bad_value;
3456			break;
3457		}
3458		goto unsupported_parameter;
3459	case Opt_inode32:
3460		ctx->full_inums = false;
3461		ctx->seen |= SHMEM_SEEN_INUMS;
3462		break;
3463	case Opt_inode64:
3464		if (sizeof(ino_t) < 8) {
3465			return invalfc(fc,
3466				       "Cannot use inode64 with <64bit inums in kernel\n");
3467		}
3468		ctx->full_inums = true;
3469		ctx->seen |= SHMEM_SEEN_INUMS;
3470		break;
3471	}
3472	return 0;
3473
3474unsupported_parameter:
3475	return invalfc(fc, "Unsupported parameter '%s'", param->key);
3476bad_value:
3477	return invalfc(fc, "Bad value for '%s'", param->key);
3478}
3479
/*
 * ->parse_monolithic: split the option string @data in place and hand
 * each "key" or "key=value" to vfs_parse_fs_string().
 *
 * security_sb_eat_lsm_opts() is given the string first.  A comma only
 * ends an option when the character after it is not a digit, so that the
 * commas inside an mpol nodelist stay part of the value.
 *
 * Returns 0 on success or the first error encountered.
 */
static int shmem_parse_options(struct fs_context *fc, void *data)
{
	char *options = data;

	if (options) {
		int err = security_sb_eat_lsm_opts(options, &fc->security);
		if (err)
			return err;
	}

	while (options != NULL) {
		char *this_char = options;
		for (;;) {
			/*
			 * NUL-terminate this option: unfortunately,
			 * mount options form a comma-separated list,
			 * but mpol's nodelist may also contain commas.
			 */
			options = strchr(options, ',');
			if (options == NULL)
				break;
			options++;
			if (!isdigit(*options)) {
				options[-1] = '\0';
				break;
			}
		}
		/* Skip empty options such as those produced by ",," */
		if (*this_char) {
			char *value = strchr(this_char,'=');
			size_t len = 0;
			int err;

			if (value) {
				/* Split "key=value" into two strings */
				*value++ = '\0';
				len = strlen(value);
			}
			err = vfs_parse_fs_string(fc, this_char, value, len);
			if (err < 0)
				return err;
		}
	}
	return 0;
}
3523
/*
 * Reconfigure a shmem filesystem.
 *
 * Note that we disallow change from limited->unlimited blocks/inodes while any
 * are in use; but we must separately disallow unlimited->limited, because in
 * that case we have no record of how much is already in use.
 *
 * All checks and updates are done under sbinfo->stat_lock; nothing is
 * changed unless every check passes.  Returns 0 on success, or the result
 * of invalfc() carrying a message that explains the refusal.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
	unsigned long inodes;
	const char *err;

	spin_lock(&sbinfo->stat_lock);
	/* Number of inodes currently in use */
	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
		if (!sbinfo->max_blocks) {
			err = "Cannot retroactively limit size";
			goto out;
		}
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   ctx->blocks) > 0) {
			err = "Too small a size for current use";
			goto out;
		}
	}
	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
		if (!sbinfo->max_inodes) {
			err = "Cannot retroactively limit inodes";
			goto out;
		}
		if (ctx->inodes < inodes) {
			err = "Too few inodes for current use";
			goto out;
		}
	}

	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
	    sbinfo->next_ino > UINT_MAX) {
		err = "Current inum too high to switch to 32-bit inums";
		goto out;
	}

	/* All checks passed: apply only the options that were given */
	if (ctx->seen & SHMEM_SEEN_HUGE)
		sbinfo->huge = ctx->huge;
	if (ctx->seen & SHMEM_SEEN_INUMS)
		sbinfo->full_inums = ctx->full_inums;
	if (ctx->seen & SHMEM_SEEN_BLOCKS)
		sbinfo->max_blocks  = ctx->blocks;
	if (ctx->seen & SHMEM_SEEN_INODES) {
		sbinfo->max_inodes  = ctx->inodes;
		/* Keep the in-use count unchanged under the new limit */
		sbinfo->free_inodes = ctx->inodes - inodes;
	}

	/*
	 * Preserve previous mempolicy unless mpol remount option was specified.
	 */
	if (ctx->mpol) {
		mpol_put(sbinfo->mpol);
		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
		ctx->mpol = NULL;
	}
	spin_unlock(&sbinfo->stat_lock);
	return 0;
out:
	spin_unlock(&sbinfo->stat_lock);
	return invalfc(fc, "%s", err);
}
3593
/*
 * ->show_options: print the mount options of this instance to @seq.
 * size, nr_inodes, mode, uid and gid are only shown when they differ from
 * the defaults tested below.  Always returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);

	/* The block limit is reported as a size in kilobytes */
	if (sbinfo->max_blocks != shmem_default_max_blocks())
		seq_printf(seq, ",size=%luk",
			sbinfo->max_blocks << (PAGE_SHIFT - 10));
	if (sbinfo->max_inodes != shmem_default_max_inodes())
		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
	if (sbinfo->mode != (0777 | S_ISVTX))
		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(seq, ",uid=%u",
				from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(seq, ",gid=%u",
				from_kgid_munged(&init_user_ns, sbinfo->gid));

	/*
	 * Showing inode{64,32} might be useful even if it's the system default,
	 * since then people don't have to resort to checking both here and
	 * /proc/config.gz to confirm 64-bit inums were successfully applied
	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
	 *
	 * We hide it when inode64 isn't the default and we are using 32-bit
	 * inodes, since that probably just means the feature isn't even under
	 * consideration.
	 *
	 * As such:
	 *
	 *                     +-----------------+-----------------+
	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
	 *  +------------------+-----------------+-----------------+
	 *  | full_inums=true  | show            | show            |
	 *  | full_inums=false | show            | hide            |
	 *  +------------------+-----------------+-----------------+
	 *
	 */
	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
	if (sbinfo->huge)
		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
	shmem_show_mpol(seq, sbinfo->mpol);
	return 0;
}
3642
3643#endif /* CONFIG_TMPFS */
3644
3645static void shmem_put_super(struct super_block *sb)
3646{
3647	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3648
3649	free_percpu(sbinfo->ino_batch);
3650	percpu_counter_destroy(&sbinfo->used_blocks);
3651	mpol_put(sbinfo->mpol);
3652	kfree(sbinfo);
3653	sb->s_fs_info = NULL;
3654}
3655
/*
 * Set up a new tmpfs superblock from the options collected in
 * fc->fs_private: allocate the shmem_sb_info, apply limits and ownership,
 * initialise the superblock fields and create the root directory.
 *
 * Returns 0 on success or -ENOMEM; every failure after the sbinfo has
 * been attached goes through shmem_put_super() to undo it.
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;
	struct inode *inode;
	struct shmem_sb_info *sbinfo;
	int err = -ENOMEM;

	/* Round up to L1_CACHE_BYTES to resist false sharing */
	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				L1_CACHE_BYTES), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;

	sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance, limiting inodes to one per page of lowmem;
	 * but the internal instance is left unlimited.
	 */
	if (!(sb->s_flags & SB_KERNMOUNT)) {
		/* Fill in defaults for whatever was not given explicitly */
		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
			ctx->blocks = shmem_default_max_blocks();
		if (!(ctx->seen & SHMEM_SEEN_INODES))
			ctx->inodes = shmem_default_max_inodes();
		if (!(ctx->seen & SHMEM_SEEN_INUMS))
			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
	} else {
		sb->s_flags |= SB_NOUSER;
	}
	sb->s_export_op = &shmem_export_ops;
	sb->s_flags |= SB_NOSEC;
#else
	sb->s_flags |= SB_NOUSER;
#endif
	sbinfo->max_blocks = ctx->blocks;
	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
	/* Only the kernel-internal mount gets a per-cpu ino_batch */
	if (sb->s_flags & SB_KERNMOUNT) {
		sbinfo->ino_batch = alloc_percpu(ino_t);
		if (!sbinfo->ino_batch)
			goto failed;
	}
	sbinfo->uid = ctx->uid;
	sbinfo->gid = ctx->gid;
	sbinfo->full_inums = ctx->full_inums;
	sbinfo->mode = ctx->mode;
	sbinfo->huge = ctx->huge;
	/* Ownership of the mempolicy moves from the context to the sbinfo */
	sbinfo->mpol = ctx->mpol;
	ctx->mpol = NULL;

	spin_lock_init(&sbinfo->stat_lock);
	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
		goto failed;
	spin_lock_init(&sbinfo->shrinklist_lock);
	INIT_LIST_HEAD(&sbinfo->shrinklist);

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	sb->s_op = &shmem_ops;
	sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
	sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	sb->s_flags |= SB_POSIXACL;
#endif
	uuid_gen(&sb->s_uuid);

	/* Create the root directory with the configured mode and owner */
	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
	if (!inode)
		goto failed;
	inode->i_uid = sbinfo->uid;
	inode->i_gid = sbinfo->gid;
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto failed;
	return 0;

failed:
	shmem_put_super(sb);
	return err;
}
3741
/* ->get_tree: build the superblock through get_tree_nodev() and shmem_fill_super() */
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}
3746
3747static void shmem_free_fc(struct fs_context *fc)
3748{
3749	struct shmem_options *ctx = fc->fs_private;
3750
3751	if (ctx) {
3752		mpol_put(ctx->mpol);
3753		kfree(ctx);
3754	}
3755}
3756
/*
 * fs_context operations installed by shmem_init_fs_context(); option
 * parsing and reconfiguration are only available with CONFIG_TMPFS.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_options,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};
3766
/* Slab cache of struct shmem_inode_info, created by shmem_init_inodecache() */
static struct kmem_cache *shmem_inode_cachep;
3768
3769static struct inode *shmem_alloc_inode(struct super_block *sb)
3770{
3771	struct shmem_inode_info *info;
3772	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3773	if (!info)
3774		return NULL;
3775	return &info->vfs_inode;
3776}
3777
3778static void shmem_free_in_core_inode(struct inode *inode)
3779{
3780	if (S_ISLNK(inode->i_mode))
3781		kfree(inode->i_link);
3782	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3783}
3784
3785static void shmem_destroy_inode(struct inode *inode)
3786{
3787	if (S_ISREG(inode->i_mode))
3788		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3789}
3790
3791static void shmem_init_inode(void *foo)
3792{
3793	struct shmem_inode_info *info = foo;
3794	inode_init_once(&info->vfs_inode);
3795}
3796
/*
 * Create the shmem_inode_info slab cache with shmem_init_inode() as its
 * constructor.  SLAB_PANIC is passed, so there is no failure return here.
 */
static void shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
3803
/* Destroy the slab cache created by shmem_init_inodecache() */
static void shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
3808
/*
 * Address space operations for shmem mappings.  write_begin/write_end are
 * only provided with CONFIG_TMPFS, migratepage only with
 * CONFIG_MIGRATION.  Exported for use outside this file.
 */
const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.set_page_dirty	= __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
	.error_remove_page = generic_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
3822
/*
 * File operations for shmem regular files.  Without CONFIG_TMPFS only
 * mmap and get_unmapped_area are provided.
 */
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};
3836
/* Inode operations table; listxattr and set_acl are only set with CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
#endif
};
3845
/*
 * Inode operations for directories.  Namespace operations need
 * CONFIG_TMPFS, listxattr needs CONFIG_TMPFS_XATTR, and setattr/set_acl
 * are only set with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
3867
/*
 * Inode operations table named for special inodes; it is empty unless
 * CONFIG_TMPFS_XATTR or CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
3877
/*
 * Superblock operations, installed as sb->s_op by shmem_fill_super().
 * statfs/show_options need CONFIG_TMPFS; the cached-object hooks need
 * CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};
3894
/*
 * VM operations for shmem mappings (shmem_zero_setup() installs them on
 * its vma); the mempolicy hooks are only set with CONFIG_NUMA.
 */
static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
3903
3904int shmem_init_fs_context(struct fs_context *fc)
3905{
3906	struct shmem_options *ctx;
3907
3908	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3909	if (!ctx)
3910		return -ENOMEM;
3911
3912	ctx->mode = 0777 | S_ISVTX;
3913	ctx->uid = current_fsuid();
3914	ctx->gid = current_fsgid();
3915
3916	fc->fs_private = ctx;
3917	fc->ops = &shmem_fs_context_ops;
3918	return 0;
3919}
3920
/* The "tmpfs" filesystem type (full CONFIG_SHMEM implementation) */
static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
3931
/*
 * Boot-time initialisation: create the inode cache, register tmpfs and
 * set up the kernel-internal mount shm_mnt.
 *
 * On failure everything set up so far is undone and shm_mnt is left as an
 * ERR_PTR, which __shmem_file_setup() checks with IS_ERR().  Returns 0 or
 * a negative errno.
 */
int __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Apply the global shmem_huge setting to the internal mount */
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = 0; /* just in case it was patched */
#endif
	return 0;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
	return error;
}
3966
3967#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
3968static ssize_t shmem_enabled_show(struct kobject *kobj,
3969				  struct kobj_attribute *attr, char *buf)
3970{
3971	static const int values[] = {
3972		SHMEM_HUGE_ALWAYS,
3973		SHMEM_HUGE_WITHIN_SIZE,
3974		SHMEM_HUGE_ADVISE,
3975		SHMEM_HUGE_NEVER,
3976		SHMEM_HUGE_DENY,
3977		SHMEM_HUGE_FORCE,
3978	};
3979	int len = 0;
3980	int i;
3981
3982	for (i = 0; i < ARRAY_SIZE(values); i++) {
3983		len += sysfs_emit_at(buf, len,
3984				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
3985				     i ? " " : "",
3986				     shmem_format_huge(values[i]));
3987	}
3988
3989	len += sysfs_emit_at(buf, len, "\n");
3990
3991	return len;
3992}
3993
3994static ssize_t shmem_enabled_store(struct kobject *kobj,
3995		struct kobj_attribute *attr, const char *buf, size_t count)
3996{
3997	char tmp[16];
3998	int huge;
3999
4000	if (count + 1 > sizeof(tmp))
4001		return -EINVAL;
4002	memcpy(tmp, buf, count);
4003	tmp[count] = '\0';
4004	if (count && tmp[count - 1] == '\n')
4005		tmp[count - 1] = '\0';
4006
4007	huge = shmem_parse_huge(tmp);
4008	if (huge == -EINVAL)
4009		return -EINVAL;
4010	if (!has_transparent_hugepage() &&
4011			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4012		return -EINVAL;
4013
4014	shmem_huge = huge;
4015	if (shmem_huge > SHMEM_HUGE_DENY)
4016		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4017	return count;
4018}
4019
/* sysfs attribute "shmem_enabled" (mode 0644) backed by the show/store handlers above */
struct kobj_attribute shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
4022#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4023
4024#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Decide whether huge pages may be used for the shmem mapping @vma.
 *
 * A per-vma or per-mm opt-out (VM_NOHUGEPAGE, MMF_DISABLE_THP) is checked
 * first, then the global shmem_huge overrides FORCE and DENY, and finally
 * the per-superblock huge setting.
 */
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	loff_t i_size;
	pgoff_t off;

	if ((vma->vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return false;
	if (shmem_huge == SHMEM_HUGE_FORCE)
		return true;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	switch (sbinfo->huge) {
		case SHMEM_HUGE_NEVER:
			return false;
		case SHMEM_HUGE_ALWAYS:
			return true;
		case SHMEM_HUGE_WITHIN_SIZE:
			/*
			 * Allow it when the page-rounded file size is at
			 * least one huge page and reaches the vma's start
			 * offset rounded up to a huge page boundary.
			 */
			off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
					i_size >> PAGE_SHIFT >= off)
				return true;
			fallthrough;
		case SHMEM_HUGE_ADVISE:
			/* TODO: implement fadvise() hints */
			return (vma->vm_flags & VM_HUGEPAGE);
		default:
			VM_BUG_ON(1);
			return false;
	}
}
4059#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4060
4061#else /* !CONFIG_SHMEM */
4062
4063/*
4064 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4065 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
4070 */
4071
/* The "tmpfs" filesystem type for !CONFIG_SHMEM, built on the ramfs context and parameters */
static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT,
};
4079
/*
 * Boot-time initialisation for !CONFIG_SHMEM: register tmpfs and create
 * the internal mount.  Either step failing is a BUG; always returns 0.
 */
int __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));

	return 0;
}
4089
/* !CONFIG_SHMEM stub: does nothing and returns 0 */
int shmem_unuse(unsigned int type, bool frontswap,
		unsigned long *fs_pages_to_unuse)
{
	return 0;
}
4095
/* !CONFIG_SHMEM stub: does nothing and returns 0 */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	return 0;
}
4100
/* !CONFIG_SHMEM stub: nothing to do */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
4104
4105#ifdef CONFIG_MMU
/* !CONFIG_SHMEM version: defer to the current mm's get_unmapped_area() */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
4112#endif
4113
/* !CONFIG_SHMEM version: truncate the page cache range [lstart, lend] of @inode */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
4119
/*
 * !CONFIG_SHMEM: map the shmem names used by the common code below onto
 * their generic/ramfs equivalents; size accounting becomes a no-op.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)
4125
4126#endif /* CONFIG_SHMEM */
4127
4128/* common code */
4129
/*
 * Common helper behind the shmem_*file_setup() variants: create an
 * unlinked regular file of @size bytes on @mnt.
 *
 * @flags is passed to shmem_acct_size() and shmem_get_inode(); @i_flags
 * is OR-ed into inode->i_flags.
 *
 * Returns the new file, or an ERR_PTR: the error carried by @mnt,
 * -EINVAL for a size outside [0, MAX_LFS_FILESIZE], -ENOMEM when size
 * accounting fails, -ENOSPC when no inode is available, or the error from
 * ramfs_nommu_expand_for_mapping() / alloc_file_pseudo().
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
				flags);
	if (unlikely(!inode)) {
		shmem_unacct_size(flags, size);
		return ERR_PTR(-ENOSPC);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	/* A zero return becomes a NULL res, which IS_ERR() does not flag */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}
4162
/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * 	kernel internal.  There will be NO LSM permission checks against the
 * 	underlying inode.  So users of this interface must do LSM checks at a
 * 	higher layer.  The users are the big_key and shm implementations.  LSM
 * 	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with S_PRIVATE set.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
4177
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
4189
/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4203
/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 *
 * Backs @vma with a new unlinked tmpfs file sized to the vma, replacing
 * any file the vma already referenced.  Returns 0 on success or the error
 * from shmem_kernel_file_setup().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Drop the reference on whatever file the vma pointed at before */
	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_vm_ops;

	/* Only when the vma spans at least one aligned huge page range */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
			(vma->vm_end & HPAGE_PMD_MASK)) {
		khugepaged_enter(vma, vma->vm_flags);
	}

	return 0;
}
4236
4237/**
4238 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4239 * @mapping:	the page's address_space
4240 * @index:	the page index
4241 * @gfp:	the page allocator flags to use if allocating
4242 *
4243 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4244 * with any new page allocations done using the specified allocation flags.
4245 * But read_cache_page_gfp() uses the ->readpage() method: which does not
4246 * suit tmpfs, since it may have pages in swapcache, and needs to find those
4247 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4248 *
4249 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4250 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4251 */
4252struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4253					 pgoff_t index, gfp_t gfp)
4254{
4255#ifdef CONFIG_SHMEM
4256	struct inode *inode = mapping->host;
4257	struct page *page;
4258	int error;
4259
4260	BUG_ON(!shmem_mapping(mapping));
4261	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
4262				  gfp, NULL, NULL, NULL);
4263	if (error)
4264		page = ERR_PTR(error);
4265	else
4266		unlock_page(page);
4267	return page;
4268#else
4269	/*
4270	 * The tiny !SHMEM case uses ramfs without swap
4271	 */
4272	return read_cache_page_gfp(mapping, index, gfp);
4273#endif
4274}
4275EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
Configure Feed

Configure Feed