mm/shmem.c at v4.9 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / shmem.c
at v4.9 109 kB view raw
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *		 2000 Transmeta Corp.
   6 *		 2000-2001 Christoph Rohland
   7 *		 2000-2001 SAP AG
   8 *		 2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/mm.h>
  32#include <linux/export.h>
  33#include <linux/swap.h>
  34#include <linux/uio.h>
  35#include <linux/khugepaged.h>
  36
  37static struct vfsmount *shm_mnt;
  38
  39#ifdef CONFIG_SHMEM
  40/*
  41 * This virtual memory filesystem is heavily based on the ramfs. It
  42 * extends ramfs by the ability to use swap and honor resource limits
  43 * which makes it a completely usable filesystem.
  44 */
  45
  46#include <linux/xattr.h>
  47#include <linux/exportfs.h>
  48#include <linux/posix_acl.h>
  49#include <linux/posix_acl_xattr.h>
  50#include <linux/mman.h>
  51#include <linux/string.h>
  52#include <linux/slab.h>
  53#include <linux/backing-dev.h>
  54#include <linux/shmem_fs.h>
  55#include <linux/writeback.h>
  56#include <linux/blkdev.h>
  57#include <linux/pagevec.h>
  58#include <linux/percpu_counter.h>
  59#include <linux/falloc.h>
  60#include <linux/splice.h>
  61#include <linux/security.h>
  62#include <linux/swapops.h>
  63#include <linux/mempolicy.h>
  64#include <linux/namei.h>
  65#include <linux/ctype.h>
  66#include <linux/migrate.h>
  67#include <linux/highmem.h>
  68#include <linux/seq_file.h>
  69#include <linux/magic.h>
  70#include <linux/syscalls.h>
  71#include <linux/fcntl.h>
  72#include <uapi/linux/memfd.h>
  73
  74#include <asm/uaccess.h>
  75#include <asm/pgtable.h>
  76
  77#include "internal.h"
  78
/* Number of 512-byte units per page: scales page counts into inode->i_blocks */
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
/* Number of pages needed to hold @size bytes (size is page-aligned first) */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
  87
/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 *
 * All offsets and counters below are expressed in pages (pgoff_t).
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};
 100
 101#ifdef CONFIG_TMPFS
 102static unsigned long shmem_default_max_blocks(void)
 103{
 104	return totalram_pages / 2;
 105}
 106
 107static unsigned long shmem_default_max_inodes(void)
 108{
 109	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
 110}
 111#endif
 112
/*
 * Forward declarations of helpers that are used before this point in the
 * file reaches their definitions (they are static, so presumably defined
 * further down).
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
 119
 120int shmem_getpage(struct inode *inode, pgoff_t index,
 121		struct page **pagep, enum sgp_type sgp)
 122{
 123	return shmem_getpage_gfp(inode, index, pagep, sgp,
 124		mapping_gfp_mask(inode->i_mapping), NULL, NULL);
 125}
 126
 127static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 128{
 129	return sb->s_fs_info;
 130}
 131
 132/*
 133 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 134 * for shared memory and for shared anonymous (/dev/zero) mappings
 135 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 136 * consistent with the pre-accounting of private mappings ...
 137 */
 138static inline int shmem_acct_size(unsigned long flags, loff_t size)
 139{
 140	return (flags & VM_NORESERVE) ?
 141		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 142}
 143
 144static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 145{
 146	if (!(flags & VM_NORESERVE))
 147		vm_unacct_memory(VM_ACCT(size));
 148}
 149
 150static inline int shmem_reacct_size(unsigned long flags,
 151		loff_t oldsize, loff_t newsize)
 152{
 153	if (!(flags & VM_NORESERVE)) {
 154		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 155			return security_vm_enough_memory_mm(current->mm,
 156					VM_ACCT(newsize) - VM_ACCT(oldsize));
 157		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 158			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 159	}
 160	return 0;
 161}
 162
 163/*
 164 * ... whereas tmpfs objects are accounted incrementally as
 165 * pages are allocated, in order to allow large sparse files.
 166 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 167 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 168 */
 169static inline int shmem_acct_block(unsigned long flags, long pages)
 170{
 171	if (!(flags & VM_NORESERVE))
 172		return 0;
 173
 174	return security_vm_enough_memory_mm(current->mm,
 175			pages * VM_ACCT(PAGE_SIZE));
 176}
 177
 178static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 179{
 180	if (flags & VM_NORESERVE)
 181		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 182}
 183
/*
 * Operation tables and the filesystem type, declared up front so that code
 * above their definitions can take their addresses.
 */
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

/*
 * List of shmem inodes linked through info->swaplist; entries are added
 * and removed under shmem_swaplist_mutex.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
 195
 196static int shmem_reserve_inode(struct super_block *sb)
 197{
 198	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 199	if (sbinfo->max_inodes) {
 200		spin_lock(&sbinfo->stat_lock);
 201		if (!sbinfo->free_inodes) {
 202			spin_unlock(&sbinfo->stat_lock);
 203			return -ENOSPC;
 204		}
 205		sbinfo->free_inodes--;
 206		spin_unlock(&sbinfo->stat_lock);
 207	}
 208	return 0;
 209}
 210
 211static void shmem_free_inode(struct super_block *sb)
 212{
 213	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 214	if (sbinfo->max_inodes) {
 215		spin_lock(&sbinfo->stat_lock);
 216		sbinfo->free_inodes++;
 217		spin_unlock(&sbinfo->stat_lock);
 218	}
 219}
 220
 221/**
 222 * shmem_recalc_inode - recalculate the block usage of an inode
 223 * @inode: inode to recalc
 224 *
 225 * We have to calculate the free blocks since the mm can drop
 226 * undirtied hole pages behind our back.
 227 *
 228 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 229 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 230 *
 231 * It has to be called with the spinlock held.
 232 */
 233static void shmem_recalc_inode(struct inode *inode)
 234{
 235	struct shmem_inode_info *info = SHMEM_I(inode);
 236	long freed;
 237
 238	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 239	if (freed > 0) {
 240		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 241		if (sbinfo->max_blocks)
 242			percpu_counter_add(&sbinfo->used_blocks, -freed);
 243		info->alloced -= freed;
 244		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 245		shmem_unacct_blocks(info->flags, freed);
 246	}
 247}
 248
 249bool shmem_charge(struct inode *inode, long pages)
 250{
 251	struct shmem_inode_info *info = SHMEM_I(inode);
 252	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 253	unsigned long flags;
 254
 255	if (shmem_acct_block(info->flags, pages))
 256		return false;
 257	spin_lock_irqsave(&info->lock, flags);
 258	info->alloced += pages;
 259	inode->i_blocks += pages * BLOCKS_PER_PAGE;
 260	shmem_recalc_inode(inode);
 261	spin_unlock_irqrestore(&info->lock, flags);
 262	inode->i_mapping->nrpages += pages;
 263
 264	if (!sbinfo->max_blocks)
 265		return true;
 266	if (percpu_counter_compare(&sbinfo->used_blocks,
 267				sbinfo->max_blocks - pages) > 0) {
 268		inode->i_mapping->nrpages -= pages;
 269		spin_lock_irqsave(&info->lock, flags);
 270		info->alloced -= pages;
 271		shmem_recalc_inode(inode);
 272		spin_unlock_irqrestore(&info->lock, flags);
 273		shmem_unacct_blocks(info->flags, pages);
 274		return false;
 275	}
 276	percpu_counter_add(&sbinfo->used_blocks, pages);
 277	return true;
 278}
 279
 280void shmem_uncharge(struct inode *inode, long pages)
 281{
 282	struct shmem_inode_info *info = SHMEM_I(inode);
 283	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 284	unsigned long flags;
 285
 286	spin_lock_irqsave(&info->lock, flags);
 287	info->alloced -= pages;
 288	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
 289	shmem_recalc_inode(inode);
 290	spin_unlock_irqrestore(&info->lock, flags);
 291
 292	if (sbinfo->max_blocks)
 293		percpu_counter_sub(&sbinfo->used_blocks, pages);
 294	shmem_unacct_blocks(info->flags, pages);
 295}
 296
 297/*
 298 * Replace item expected in radix tree by a new item, while holding tree lock.
 299 */
 300static int shmem_radix_tree_replace(struct address_space *mapping,
 301			pgoff_t index, void *expected, void *replacement)
 302{
 303	void **pslot;
 304	void *item;
 305
 306	VM_BUG_ON(!expected);
 307	VM_BUG_ON(!replacement);
 308	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
 309	if (!pslot)
 310		return -ENOENT;
 311	item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
 312	if (item != expected)
 313		return -ENOENT;
 314	radix_tree_replace_slot(pslot, replacement);
 315	return 0;
 316}
 317
 318/*
 319 * Sometimes, before we decide whether to proceed or to fail, we must check
 320 * that an entry was not already brought back from swap by a racing thread.
 321 *
 322 * Checking page is not enough: by the time a SwapCache page is locked, it
 323 * might be reused, and again be SwapCache, using the same swap as before.
 324 */
 325static bool shmem_confirm_swap(struct address_space *mapping,
 326			       pgoff_t index, swp_entry_t swap)
 327{
 328	void *item;
 329
 330	rcu_read_lock();
 331	item = radix_tree_lookup(&mapping->page_tree, index);
 332	rcu_read_unlock();
 333	return item == swp_to_radix_entry(swap);
 334}
 335
/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 * Both are negative, which keeps them distinct from the per-mount values
 * above.
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
 367
 368#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

/*
 * Huge page setting, one of the SHMEM_HUGE_* values.  Zero-initialised, so
 * it starts out as SHMEM_HUGE_NEVER.
 */
int shmem_huge __read_mostly;
 372
 373static int shmem_parse_huge(const char *str)
 374{
 375	if (!strcmp(str, "never"))
 376		return SHMEM_HUGE_NEVER;
 377	if (!strcmp(str, "always"))
 378		return SHMEM_HUGE_ALWAYS;
 379	if (!strcmp(str, "within_size"))
 380		return SHMEM_HUGE_WITHIN_SIZE;
 381	if (!strcmp(str, "advise"))
 382		return SHMEM_HUGE_ADVISE;
 383	if (!strcmp(str, "deny"))
 384		return SHMEM_HUGE_DENY;
 385	if (!strcmp(str, "force"))
 386		return SHMEM_HUGE_FORCE;
 387	return -EINVAL;
 388}
 389
 390static const char *shmem_format_huge(int huge)
 391{
 392	switch (huge) {
 393	case SHMEM_HUGE_NEVER:
 394		return "never";
 395	case SHMEM_HUGE_ALWAYS:
 396		return "always";
 397	case SHMEM_HUGE_WITHIN_SIZE:
 398		return "within_size";
 399	case SHMEM_HUGE_ADVISE:
 400		return "advise";
 401	case SHMEM_HUGE_DENY:
 402		return "deny";
 403	case SHMEM_HUGE_FORCE:
 404		return "force";
 405	default:
 406		VM_BUG_ON(1);
 407		return "bad_val";
 408	}
 409}
 410
 411static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 412		struct shrink_control *sc, unsigned long nr_to_split)
 413{
 414	LIST_HEAD(list), *pos, *next;
 415	struct inode *inode;
 416	struct shmem_inode_info *info;
 417	struct page *page;
 418	unsigned long batch = sc ? sc->nr_to_scan : 128;
 419	int removed = 0, split = 0;
 420
 421	if (list_empty(&sbinfo->shrinklist))
 422		return SHRINK_STOP;
 423
 424	spin_lock(&sbinfo->shrinklist_lock);
 425	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
 426		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 427
 428		/* pin the inode */
 429		inode = igrab(&info->vfs_inode);
 430
 431		/* inode is about to be evicted */
 432		if (!inode) {
 433			list_del_init(&info->shrinklist);
 434			removed++;
 435			goto next;
 436		}
 437
 438		/* Check if there's anything to gain */
 439		if (round_up(inode->i_size, PAGE_SIZE) ==
 440				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
 441			list_del_init(&info->shrinklist);
 442			removed++;
 443			iput(inode);
 444			goto next;
 445		}
 446
 447		list_move(&info->shrinklist, &list);
 448next:
 449		if (!--batch)
 450			break;
 451	}
 452	spin_unlock(&sbinfo->shrinklist_lock);
 453
 454	list_for_each_safe(pos, next, &list) {
 455		int ret;
 456
 457		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 458		inode = &info->vfs_inode;
 459
 460		if (nr_to_split && split >= nr_to_split) {
 461			iput(inode);
 462			continue;
 463		}
 464
 465		page = find_lock_page(inode->i_mapping,
 466				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 467		if (!page)
 468			goto drop;
 469
 470		if (!PageTransHuge(page)) {
 471			unlock_page(page);
 472			put_page(page);
 473			goto drop;
 474		}
 475
 476		ret = split_huge_page(page);
 477		unlock_page(page);
 478		put_page(page);
 479
 480		if (ret) {
 481			/* split failed: leave it on the list */
 482			iput(inode);
 483			continue;
 484		}
 485
 486		split++;
 487drop:
 488		list_del_init(&info->shrinklist);
 489		removed++;
 490		iput(inode);
 491	}
 492
 493	spin_lock(&sbinfo->shrinklist_lock);
 494	list_splice_tail(&list, &sbinfo->shrinklist);
 495	sbinfo->shrinklist_len -= removed;
 496	spin_unlock(&sbinfo->shrinklist_lock);
 497
 498	return split;
 499}
 500
 501static long shmem_unused_huge_scan(struct super_block *sb,
 502		struct shrink_control *sc)
 503{
 504	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 505
 506	if (!READ_ONCE(sbinfo->shrinklist_len))
 507		return SHRINK_STOP;
 508
 509	return shmem_unused_huge_shrink(sbinfo, sc, 0);
 510}
 511
 512static long shmem_unused_huge_count(struct super_block *sb,
 513		struct shrink_control *sc)
 514{
 515	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 516	return READ_ONCE(sbinfo->shrinklist_len);
 517}
 518#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
 519
/* Without huge page cache support, shmem_huge is the constant "deny". */
#define shmem_huge SHMEM_HUGE_DENY

/*
 * Stub for !CONFIG_TRANSPARENT_HUGE_PAGECACHE: there is never anything to
 * split, so report zero pages split.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
 527#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
 528
/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 *
 * @page:     locked, swap-backed head (or small) page to insert
 * @mapping:  address space to insert into
 * @index:    page offset; must be aligned to the number of subpages
 * @expected: radix tree entry that must currently be at @index and is
 *            replaced by @page, or NULL to insert into an empty slot.
 *            Must be NULL for a huge page.
 *
 * Returns 0 on success.  On failure the page's mapping is reset to NULL
 * and the extra references taken here are dropped again; the error is
 * -EEXIST if a huge page would overlap an existing entry, otherwise
 * whatever radix_tree_insert() or shmem_radix_tree_replace() returned.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected)
{
	int error, nr = hpage_nr_pages(page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	/*
	 * Take one reference per subpage before publishing the page in the
	 * tree; they are dropped again below if the insertion fails.
	 */
	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	spin_lock_irq(&mapping->tree_lock);
	if (PageTransHuge(page)) {
		void __rcu **results;
		pgoff_t idx;
		int i;

		error = 0;
		/*
		 * Find the first entry at or after @index: if it falls
		 * inside the range the huge page would cover, give up.
		 */
		if (radix_tree_gang_lookup_slot(&mapping->page_tree,
					&results, &idx, index, 1) &&
				idx < index + HPAGE_PMD_NR) {
			error = -EEXIST;
		}

		if (!error) {
			/* One slot per subpage, each pointing at its own page */
			for (i = 0; i < HPAGE_PMD_NR; i++) {
				error = radix_tree_insert(&mapping->page_tree,
						index + i, page + i);
				VM_BUG_ON(error);
			}
			count_vm_event(THP_FILE_ALLOC);
		}
	} else if (!expected) {
		error = radix_tree_insert(&mapping->page_tree, index, page);
	} else {
		error = shmem_radix_tree_replace(mapping, index, expected,
								 page);
	}

	if (!error) {
		/* Statistics are updated while tree_lock is still held */
		mapping->nrpages += nr;
		if (PageTransHuge(page))
			__inc_node_page_state(page, NR_SHMEM_THPS);
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
		spin_unlock_irq(&mapping->tree_lock);
	} else {
		page->mapping = NULL;
		spin_unlock_irq(&mapping->tree_lock);
		page_ref_sub(page, nr);
	}
	return error;
}
 590
 591/*
 592 * Like delete_from_page_cache, but substitutes swap for page.
 593 */
 594static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 595{
 596	struct address_space *mapping = page->mapping;
 597	int error;
 598
 599	VM_BUG_ON_PAGE(PageCompound(page), page);
 600
 601	spin_lock_irq(&mapping->tree_lock);
 602	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
 603	page->mapping = NULL;
 604	mapping->nrpages--;
 605	__dec_node_page_state(page, NR_FILE_PAGES);
 606	__dec_node_page_state(page, NR_SHMEM);
 607	spin_unlock_irq(&mapping->tree_lock);
 608	put_page(page);
 609	BUG_ON(error);
 610}
 611
 612/*
 613 * Remove swap entry from radix tree, free the swap and its page cache.
 614 */
 615static int shmem_free_swap(struct address_space *mapping,
 616			   pgoff_t index, void *radswap)
 617{
 618	void *old;
 619
 620	spin_lock_irq(&mapping->tree_lock);
 621	old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
 622	spin_unlock_irq(&mapping->tree_lock);
 623	if (old != radswap)
 624		return -ENOENT;
 625	free_swap_and_cache(radix_to_swp_entry(radswap));
 626	return 0;
 627}
 628
 629/*
 630 * Determine (in bytes) how many of the shmem object's pages mapped by the
 631 * given offsets are swapped out.
 632 *
 633 * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
 634 * as long as the inode doesn't go away and racy results are not a problem.
 635 */
 636unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 637						pgoff_t start, pgoff_t end)
 638{
 639	struct radix_tree_iter iter;
 640	void **slot;
 641	struct page *page;
 642	unsigned long swapped = 0;
 643
 644	rcu_read_lock();
 645
 646	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
 647		if (iter.index >= end)
 648			break;
 649
 650		page = radix_tree_deref_slot(slot);
 651
 652		if (radix_tree_deref_retry(page)) {
 653			slot = radix_tree_iter_retry(&iter);
 654			continue;
 655		}
 656
 657		if (radix_tree_exceptional_entry(page))
 658			swapped++;
 659
 660		if (need_resched()) {
 661			cond_resched_rcu();
 662			slot = radix_tree_iter_next(&iter);
 663		}
 664	}
 665
 666	rcu_read_unlock();
 667
 668	return swapped << PAGE_SHIFT;
 669}
 670
 671/*
 672 * Determine (in bytes) how many of the shmem object's pages mapped by the
 673 * given vma is swapped out.
 674 *
 675 * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
 676 * as long as the inode doesn't go away and racy results are not a problem.
 677 */
 678unsigned long shmem_swap_usage(struct vm_area_struct *vma)
 679{
 680	struct inode *inode = file_inode(vma->vm_file);
 681	struct shmem_inode_info *info = SHMEM_I(inode);
 682	struct address_space *mapping = inode->i_mapping;
 683	unsigned long swapped;
 684
 685	/* Be careful as we don't hold info->lock */
 686	swapped = READ_ONCE(info->swapped);
 687
 688	/*
 689	 * The easier cases are when the shmem object has nothing in swap, or
 690	 * the vma maps it whole. Then we can simply use the stats that we
 691	 * already track.
 692	 */
 693	if (!swapped)
 694		return 0;
 695
 696	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
 697		return swapped << PAGE_SHIFT;
 698
 699	/* Here comes the more involved part */
 700	return shmem_partial_swap_usage(mapping,
 701			linear_page_index(vma, vma->vm_start),
 702			linear_page_index(vma, vma->vm_end));
 703}
 704
 705/*
 706 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 707 */
 708void shmem_unlock_mapping(struct address_space *mapping)
 709{
 710	struct pagevec pvec;
 711	pgoff_t indices[PAGEVEC_SIZE];
 712	pgoff_t index = 0;
 713
 714	pagevec_init(&pvec, 0);
 715	/*
 716	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
 717	 */
 718	while (!mapping_unevictable(mapping)) {
 719		/*
 720		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
 721		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
 722		 */
 723		pvec.nr = find_get_entries(mapping, index,
 724					   PAGEVEC_SIZE, pvec.pages, indices);
 725		if (!pvec.nr)
 726			break;
 727		index = indices[pvec.nr - 1] + 1;
 728		pagevec_remove_exceptionals(&pvec);
 729		check_move_unevictable_pages(pvec.pages, pvec.nr);
 730		pagevec_release(&pvec);
 731		cond_resched();
 732	}
 733}
 734
/*
 * Remove range of pages and swap entries from radix tree, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @inode:    shmem inode to operate on
 * @lstart:   first byte of the range
 * @lend:     last byte of the range (inclusive); -1 means "to the end"
 * @unfalloc: when true, swap entries are left alone and only pages that
 *            are not Uptodate are removed
 *
 * The work is done in three steps:
 *  1. an opportunistic pass that only handles pages it can trylock;
 *  2. zeroing of the partial pages at either edge of the range;
 *  3. a second pass that waits for page locks and retries on races.
 * Finally the number of swap entries freed is subtracted from
 * info->swapped and the inode's block usage is recalculated.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	/* First page index lying wholly at or after lstart */
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Index of the page containing the byte just past lend (exclusive) */
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	/* Offsets within a page of lstart and of lend + 1; 0 if aligned */
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	/* Step 1: best-effort pass, skipping any page that is locked */
	pagevec_init(&pvec, 0);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			/* Exceptional entry: a swap entry rather than a page */
			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

			if (!trylock_page(page))
				continue;

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				/* Whole THP goes: skip over its tail entries */
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	/* Step 2: zero the partial page before the first whole page, if any */
	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			/*
			 * The range starts and ends within this one page:
			 * zero only up to partial_end and skip the separate
			 * partial_end handling below.
			 */
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	/* ... and the head of the page in which the range ends, if any */
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	/* No whole pages in the range: nothing left to do */
	if (start >= end)
		return;

	/* Step 3: second pass, this time waiting for each page lock */
	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				/*
				 * Partial thp truncate due 'start' in middle
				 * of THP: don't need to look on these pages
				 * again on !pvec.nr restart.
				 */
				if (index != round_down(end, HPAGE_PMD_NR))
					start++;
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				/* Whole THP goes: skip over its tail entries */
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		/* Also undoes the index-- of the retry paths above */
		index++;
	}

	/* Account for the swap entries freed by both passes */
	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}
 933
 934void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 935{
 936	shmem_undo_range(inode, lstart, lend, false);
 937	inode->i_ctime = inode->i_mtime = current_time(inode);
 938}
 939EXPORT_SYMBOL_GPL(shmem_truncate_range);
 940
 941static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
 942			 struct kstat *stat)
 943{
 944	struct inode *inode = dentry->d_inode;
 945	struct shmem_inode_info *info = SHMEM_I(inode);
 946
 947	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
 948		spin_lock_irq(&info->lock);
 949		shmem_recalc_inode(inode);
 950		spin_unlock_irq(&info->lock);
 951	}
 952	generic_fillattr(inode, stat);
 953	return 0;
 954}
 955
 956static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 957{
 958	struct inode *inode = d_inode(dentry);
 959	struct shmem_inode_info *info = SHMEM_I(inode);
 960	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 961	int error;
 962
 963	error = setattr_prepare(dentry, attr);
 964	if (error)
 965		return error;
 966
 967	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 968		loff_t oldsize = inode->i_size;
 969		loff_t newsize = attr->ia_size;
 970
 971		/* protected by i_mutex */
 972		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
 973		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 974			return -EPERM;
 975
 976		if (newsize != oldsize) {
 977			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 978					oldsize, newsize);
 979			if (error)
 980				return error;
 981			i_size_write(inode, newsize);
 982			inode->i_ctime = inode->i_mtime = current_time(inode);
 983		}
 984		if (newsize <= oldsize) {
 985			loff_t holebegin = round_up(newsize, PAGE_SIZE);
 986			if (oldsize > holebegin)
 987				unmap_mapping_range(inode->i_mapping,
 988							holebegin, 0, 1);
 989			if (info->alloced)
 990				shmem_truncate_range(inode,
 991							newsize, (loff_t)-1);
 992			/* unmap again to remove racily COWed private pages */
 993			if (oldsize > holebegin)
 994				unmap_mapping_range(inode->i_mapping,
 995							holebegin, 0, 1);
 996
 997			/*
 998			 * Part of the huge page can be beyond i_size: subject
 999			 * to shrink under memory pressure.
1000			 */
1001			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1002				spin_lock(&sbinfo->shrinklist_lock);
1003				if (list_empty(&info->shrinklist)) {
1004					list_add_tail(&info->shrinklist,
1005							&sbinfo->shrinklist);
1006					sbinfo->shrinklist_len++;
1007				}
1008				spin_unlock(&sbinfo->shrinklist_lock);
1009			}
1010		}
1011	}
1012
1013	setattr_copy(inode, attr);
1014	if (attr->ia_valid & ATTR_MODE)
1015		error = posix_acl_chmod(inode, inode->i_mode);
1016	return error;
1017}
1018
1019static void shmem_evict_inode(struct inode *inode)
1020{
1021	struct shmem_inode_info *info = SHMEM_I(inode);
1022	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1023
1024	if (inode->i_mapping->a_ops == &shmem_aops) {
1025		shmem_unacct_size(info->flags, inode->i_size);
1026		inode->i_size = 0;
1027		shmem_truncate_range(inode, 0, (loff_t)-1);
1028		if (!list_empty(&info->shrinklist)) {
1029			spin_lock(&sbinfo->shrinklist_lock);
1030			if (!list_empty(&info->shrinklist)) {
1031				list_del_init(&info->shrinklist);
1032				sbinfo->shrinklist_len--;
1033			}
1034			spin_unlock(&sbinfo->shrinklist_lock);
1035		}
1036		if (!list_empty(&info->swaplist)) {
1037			mutex_lock(&shmem_swaplist_mutex);
1038			list_del_init(&info->swaplist);
1039			mutex_unlock(&shmem_swaplist_mutex);
1040		}
1041	}
1042
1043	simple_xattrs_free(&info->xattrs);
1044	WARN_ON(inode->i_blocks);
1045	shmem_free_inode(inode->i_sb);
1046	clear_inode(inode);
1047}
1048
/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 *
 * @info:  shmem inode whose radix tree is searched for @swap.
 * @swap:  swap entry being unused.
 * @pagep: locked swapcache page for @swap; may be updated to point at a
 *         replacement page if shmem_replace_page() had to copy it.
 *
 * The caller (shmem_unuse) holds shmem_swaplist_mutex; it is dropped and
 * retaken here around shmem_replace_page().
 *
 * Returns -EAGAIN if @swap is not in this inode, 0 if the page was moved
 * into the file's page cache, -ENOENT if the swap entry went out of use
 * while the mutex was dropped, or an error from shmem_replace_page() /
 * shmem_add_to_page_cache().
 */
static int shmem_unuse_inode(struct shmem_inode_info *info,
			     swp_entry_t swap, struct page **pagep)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	void *radswap;
	pgoff_t index;
	gfp_t gfp;
	int error = 0;

	/* Swap is stored in the tree as an exceptional radix entry */
	radswap = swp_to_radix_entry(swap);
	index = radix_tree_locate_item(&mapping->page_tree, radswap);
	if (index == -1)
		return -EAGAIN;	/* tell shmem_unuse we found nothing */

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	gfp = mapping_gfp_mask(mapping);
	if (shmem_should_replace_page(*pagep, gfp)) {
		mutex_unlock(&shmem_swaplist_mutex);
		error = shmem_replace_page(pagep, gfp, info, index);
		mutex_lock(&shmem_swaplist_mutex);
		/*
		 * We needed to drop mutex to make that restrictive page
		 * allocation, but the inode might have been freed while we
		 * dropped it: although a racing shmem_evict_inode() cannot
		 * complete without emptying the radix_tree, our page lock
		 * on this swapcache page is not enough to prevent that -
		 * free_swap_and_cache() of our swap entry will only
		 * trylock_page(), removing swap from radix_tree whatever.
		 *
		 * We must not proceed to shmem_add_to_page_cache() if the
		 * inode has been freed, but of course we cannot rely on
		 * inode or mapping or info to check that.  However, we can
		 * safely check if our swap entry is still in use (and here
		 * it can't have got reused for another page): if it's still
		 * in use, then the inode cannot have been freed yet, and we
		 * can safely proceed (if it's no longer in use, that tells
		 * nothing about the inode, but we don't need to unuse swap).
		 */
		if (!page_swapcount(*pagep))
			error = -ENOENT;
	}

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	if (!error)
		error = shmem_add_to_page_cache(*pagep, mapping, index,
						radswap);
	/* On -ENOMEM the page is left in the swap cache untouched */
	if (error != -ENOMEM) {
		/*
		 * Truncation and eviction use free_swap_and_cache(), which
		 * only does trylock page: if we raced, best clean up here.
		 */
		delete_from_swap_cache(*pagep);
		set_page_dirty(*pagep);
		if (!error) {
			/* One fewer swapped page in this inode */
			spin_lock_irq(&info->lock);
			info->swapped--;
			spin_unlock_irq(&info->lock);
			swap_free(swap);
		}
	}
	return error;
}
1126
/*
 * Search through swapped inodes to find and replace swap by page.
 *
 * @swap: swap entry to look for in the inodes on shmem_swaplist.
 * @page: swapcache page for @swap, passed in locked and with a reference;
 *        both the lock and the reference are dropped before returning.
 *
 * Walks shmem_swaplist under shmem_swaplist_mutex, pruning inodes that no
 * longer have anything swapped, until shmem_unuse_inode() reports something
 * other than -EAGAIN.
 *
 * Returns the error from mem_cgroup_try_charge() if charging fails;
 * otherwise -ENOMEM if the search hit -ENOMEM, and 0 in every other case
 * (including "not found").
 */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	struct list_head *this, *next;
	struct shmem_inode_info *info;
	struct mem_cgroup *memcg;
	int error = 0;

	/*
	 * There's a faint possibility that swap page was replaced before
	 * caller locked it: caller will come back later with the right page.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
		goto out;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 */
	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
			false);
	if (error)
		goto out;
	/* No radix_tree_preload: swap entry keeps a place for page in tree */
	error = -EAGAIN;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(this, next, &shmem_swaplist) {
		info = list_entry(this, struct shmem_inode_info, swaplist);
		if (info->swapped)
			error = shmem_unuse_inode(info, swap, &page);
		else
			list_del_init(&info->swaplist);
		cond_resched();
		if (error != -EAGAIN)
			break;
		/* found nothing in this: move on to search the next */
	}
	mutex_unlock(&shmem_swaplist_mutex);

	if (error) {
		/* Only -ENOMEM is passed up; anything else is reported as 0 */
		if (error != -ENOMEM)
			error = 0;
		mem_cgroup_cancel_charge(page, memcg, false);
	} else
		mem_cgroup_commit_charge(page, memcg, true, false);
out:
	unlock_page(page);
	put_page(page);
	return error;
}
1181
/*
 * Move the page from the page cache to the swap cache.
 *
 * @page: locked, non-compound shmem page cache page.
 * @wbc:  writeback control; only wbc->for_reclaim is examined here before
 *        the page is handed to swap_writepage().
 *
 * Returns 0 once the page has been passed to swap_writepage(), or when it
 * was redirtied and unlocked for a non-reclaim caller.  Returns
 * AOP_WRITEPAGE_ACTIVATE, with the page redirtied and still locked, when a
 * reclaim caller's page could not be moved to swap.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	VM_BUG_ON_PAGE(PageCompound(page), page);
	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	/* Locked (see shmem_lock) files are never written to swap */
	if (info->flags & VM_LOCKED)
		goto redirty;
	/* No swap configured: nowhere to write to */
	if (!total_swap_pages)
		goto redirty;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			/*
			 * A NULL waitq distinguishes this case from the one
			 * shmem_fault() waits on (which requires waitq set).
			 */
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		/* Zero the never-written page so it can be swapped out */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	if (mem_cgroup_try_charge_swap(page, swap))
		goto free_swap;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		info->swapped++;
		spin_unlock_irq(&info->lock);

		swap_shmem_alloc(swap);
		/* Leave the swap entry in the file's radix tree slot */
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
free_swap:
	swapcache_free(swap);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}
1292
1293#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1294static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1295{
1296	char buffer[64];
1297
1298	if (!mpol || mpol->mode == MPOL_DEFAULT)
1299		return;		/* show nothing */
1300
1301	mpol_to_str(buffer, sizeof(buffer), mpol);
1302
1303	seq_printf(seq, ",mpol=%s", buffer);
1304}
1305
1306static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1307{
1308	struct mempolicy *mpol = NULL;
1309	if (sbinfo->mpol) {
1310		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1311		mpol = sbinfo->mpol;
1312		mpol_get(mpol);
1313		spin_unlock(&sbinfo->stat_lock);
1314	}
1315	return mpol;
1316}
1317#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: no mount policy is shown. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stub for !CONFIG_NUMA || !CONFIG_TMPFS: always reports "no policy". */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
1325#endif /* CONFIG_NUMA && CONFIG_TMPFS */
#ifndef CONFIG_NUMA
/*
 * Without CONFIG_NUMA, every use of vm_policy in the code below is
 * rewritten to vm_private_data, so the pseudo-vma helpers build unchanged.
 * NOTE(review): presumably struct vm_area_struct has no vm_policy member
 * in that configuration - confirm against the struct definition.
 */
#define vm_policy vm_private_data
#endif
1329
1330static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1331		struct shmem_inode_info *info, pgoff_t index)
1332{
1333	/* Create a pseudo vma that just contains the policy */
1334	vma->vm_start = 0;
1335	/* Bias interleave by inode number to distribute better across nodes */
1336	vma->vm_pgoff = index + info->vfs_inode.i_ino;
1337	vma->vm_ops = NULL;
1338	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1339}
1340
/*
 * Undo shmem_pseudo_vma_init(): release the policy held in @vma.  Must be
 * called once for every initialised pseudo vma.
 */
static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(vma->vm_policy);
}
1346
1347static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1348			struct shmem_inode_info *info, pgoff_t index)
1349{
1350	struct vm_area_struct pvma;
1351	struct page *page;
1352
1353	shmem_pseudo_vma_init(&pvma, info, index);
1354	page = swapin_readahead(swap, gfp, &pvma, 0);
1355	shmem_pseudo_vma_destroy(&pvma);
1356
1357	return page;
1358}
1359
1360static struct page *shmem_alloc_hugepage(gfp_t gfp,
1361		struct shmem_inode_info *info, pgoff_t index)
1362{
1363	struct vm_area_struct pvma;
1364	struct inode *inode = &info->vfs_inode;
1365	struct address_space *mapping = inode->i_mapping;
1366	pgoff_t idx, hindex;
1367	void __rcu **results;
1368	struct page *page;
1369
1370	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1371		return NULL;
1372
1373	hindex = round_down(index, HPAGE_PMD_NR);
1374	rcu_read_lock();
1375	if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
1376				hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1377		rcu_read_unlock();
1378		return NULL;
1379	}
1380	rcu_read_unlock();
1381
1382	shmem_pseudo_vma_init(&pvma, info, hindex);
1383	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
1384			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
1385	shmem_pseudo_vma_destroy(&pvma);
1386	if (page)
1387		prep_transhuge_page(page);
1388	return page;
1389}
1390
1391static struct page *shmem_alloc_page(gfp_t gfp,
1392			struct shmem_inode_info *info, pgoff_t index)
1393{
1394	struct vm_area_struct pvma;
1395	struct page *page;
1396
1397	shmem_pseudo_vma_init(&pvma, info, index);
1398	page = alloc_page_vma(gfp, &pvma, 0);
1399	shmem_pseudo_vma_destroy(&pvma);
1400
1401	return page;
1402}
1403
1404static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1405		struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
1406		pgoff_t index, bool huge)
1407{
1408	struct page *page;
1409	int nr;
1410	int err = -ENOSPC;
1411
1412	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1413		huge = false;
1414	nr = huge ? HPAGE_PMD_NR : 1;
1415
1416	if (shmem_acct_block(info->flags, nr))
1417		goto failed;
1418	if (sbinfo->max_blocks) {
1419		if (percpu_counter_compare(&sbinfo->used_blocks,
1420					sbinfo->max_blocks - nr) > 0)
1421			goto unacct;
1422		percpu_counter_add(&sbinfo->used_blocks, nr);
1423	}
1424
1425	if (huge)
1426		page = shmem_alloc_hugepage(gfp, info, index);
1427	else
1428		page = shmem_alloc_page(gfp, info, index);
1429	if (page) {
1430		__SetPageLocked(page);
1431		__SetPageSwapBacked(page);
1432		return page;
1433	}
1434
1435	err = -ENOMEM;
1436	if (sbinfo->max_blocks)
1437		percpu_counter_add(&sbinfo->used_blocks, -nr);
1438unacct:
1439	shmem_unacct_blocks(info->flags, nr);
1440failed:
1441	return ERR_PTR(err);
1442}
1443
1444/*
1445 * When a page is moved from swapcache to shmem filecache (either by the
1446 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
1447 * shmem_unuse_inode()), it may have been read in earlier from swap, in
1448 * ignorance of the mapping it belongs to.  If that mapping has special
1449 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1450 * we may need to copy to a suitable page before moving to filecache.
1451 *
1452 * In a future release, this may well be extended to respect cpuset and
1453 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1454 * but for now it is a simple matter of zone.
1455 */
1456static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
1457{
1458	return page_zonenum(page) > gfp_zone(gfp);
1459}
1460
1461static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1462				struct shmem_inode_info *info, pgoff_t index)
1463{
1464	struct page *oldpage, *newpage;
1465	struct address_space *swap_mapping;
1466	pgoff_t swap_index;
1467	int error;
1468
1469	oldpage = *pagep;
1470	swap_index = page_private(oldpage);
1471	swap_mapping = page_mapping(oldpage);
1472
1473	/*
1474	 * We have arrived here because our zones are constrained, so don't
1475	 * limit chance of success by further cpuset and node constraints.
1476	 */
1477	gfp &= ~GFP_CONSTRAINT_MASK;
1478	newpage = shmem_alloc_page(gfp, info, index);
1479	if (!newpage)
1480		return -ENOMEM;
1481
1482	get_page(newpage);
1483	copy_highpage(newpage, oldpage);
1484	flush_dcache_page(newpage);
1485
1486	__SetPageLocked(newpage);
1487	__SetPageSwapBacked(newpage);
1488	SetPageUptodate(newpage);
1489	set_page_private(newpage, swap_index);
1490	SetPageSwapCache(newpage);
1491
1492	/*
1493	 * Our caller will very soon move newpage out of swapcache, but it's
1494	 * a nice clean interface for us to replace oldpage by newpage there.
1495	 */
1496	spin_lock_irq(&swap_mapping->tree_lock);
1497	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1498								   newpage);
1499	if (!error) {
1500		__inc_node_page_state(newpage, NR_FILE_PAGES);
1501		__dec_node_page_state(oldpage, NR_FILE_PAGES);
1502	}
1503	spin_unlock_irq(&swap_mapping->tree_lock);
1504
1505	if (unlikely(error)) {
1506		/*
1507		 * Is this possible?  I think not, now that our callers check
1508		 * both PageSwapCache and page_private after getting page lock;
1509		 * but be defensive.  Reverse old to newpage for clear and free.
1510		 */
1511		oldpage = newpage;
1512	} else {
1513		mem_cgroup_migrate(oldpage, newpage);
1514		lru_cache_add_anon(newpage);
1515		*pagep = newpage;
1516	}
1517
1518	ClearPageSwapCache(oldpage);
1519	set_page_private(oldpage, 0);
1520
1521	unlock_page(oldpage);
1522	put_page(oldpage);
1523	put_page(oldpage);
1524	return error;
1525}
1526
/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * @inode:      shmem inode to look in.
 * @index:      page index within the file.
 * @pagep:      on success, set to the page for @index; this is NULL when
 *              @sgp is SGP_READ and there is neither a usable page nor
 *              swap at @index.
 * @sgp:        what the caller wants; SGP_NOHUGE and SGP_HUGE behave as
 *              SGP_CACHE apart from steering the huge page decision.
 * @gfp:        allocation mask for swapin, new pages and charging.
 * @fault_mm:   mm to charge instead of current->mm, or NULL.
 * @fault_type: if not NULL, VM_FAULT_MAJOR is or'ed in when swap has to be
 *              read in.
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * fault_mm and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 *
 * Returns 0 on success, -EFBIG if @index is beyond MAX_LFS_FILESIZE,
 * -EINVAL if @sgp <= SGP_CACHE and @index is at or beyond i_size, or
 * another negative errno from swapin, accounting or allocation.  -EEXIST
 * is never returned: it makes the lookup start over.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
	struct mm_struct *fault_mm, int *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo;
	struct mm_struct *charge_mm;
	struct mem_cgroup *memcg;
	struct page *page;
	swp_entry_t swap;
	enum sgp_type sgp_huge = sgp;	/* remembers SGP_HUGE / SGP_NOHUGE */
	pgoff_t hindex = index;		/* index of head page if huge */
	int error;
	int once = 0;			/* -ENOSPC is retried only once */
	int alloced = 0;

	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
		return -EFBIG;
	if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
		sgp = SGP_CACHE;
repeat:
	swap.val = 0;
	page = find_lock_entry(mapping, index);
	if (radix_tree_exceptional_entry(page)) {
		/* The slot holds a swap entry rather than a page */
		swap = radix_to_swp_entry(page);
		page = NULL;
	}

	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		goto unlock;
	}

	if (page && sgp == SGP_WRITE)
		mark_page_accessed(page);

	/* fallocated page? */
	if (page && !PageUptodate(page)) {
		if (sgp != SGP_READ)
			goto clear;
		/* SGP_READ treats a not-yet-initialized page like a hole */
		unlock_page(page);
		put_page(page);
		page = NULL;
	}
	if (page || (sgp == SGP_READ && !swap.val)) {
		*pagep = page;
		return 0;
	}

	/*
	 * Fast cache lookup did not find it:
	 * bring it back from swap or allocate.
	 */
	info = SHMEM_I(inode);
	sbinfo = SHMEM_SB(inode->i_sb);
	charge_mm = fault_mm ? : current->mm;

	if (swap.val) {
		/* Look it up and read it in.. */
		page = lookup_swap_cache(swap);
		if (!page) {
			/* Or update major stats only when swapin succeeds?? */
			if (fault_type) {
				*fault_type |= VM_FAULT_MAJOR;
				count_vm_event(PGMAJFAULT);
				mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
			}
			/* Here we actually start the io */
			page = shmem_swapin(swap, gfp, info, index);
			if (!page) {
				error = -ENOMEM;
				goto failed;
			}
		}

		/* We have to do this with page locked to prevent races */
		lock_page(page);
		if (!PageSwapCache(page) || page_private(page) != swap.val ||
		    !shmem_confirm_swap(mapping, index, swap)) {
			error = -EEXIST;	/* try again */
			goto unlock;
		}
		if (!PageUptodate(page)) {
			error = -EIO;
			goto failed;
		}
		wait_on_page_writeback(page);

		/* Copy to a page in an acceptable zone if need be */
		if (shmem_should_replace_page(page, gfp)) {
			error = shmem_replace_page(&page, gfp, info, index);
			if (error)
				goto failed;
		}

		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
				false);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
						swp_to_radix_entry(swap));
			/*
			 * We already confirmed swap under page lock, and make
			 * no memory allocation here, so usually no possibility
			 * of error; but free_swap_and_cache() only trylocks a
			 * page, so it is just possible that the entry has been
			 * truncated or holepunched since swap was confirmed.
			 * shmem_undo_range() will have done some of the
			 * unaccounting, now delete_from_swap_cache() will do
			 * the rest.
			 * Reset swap.val? No, leave it so "failed" goes back to
			 * "repeat": reading a hole and writing should succeed.
			 */
			if (error) {
				mem_cgroup_cancel_charge(page, memcg, false);
				delete_from_swap_cache(page);
			}
		}
		if (error)
			goto failed;

		mem_cgroup_commit_charge(page, memcg, true, false);

		spin_lock_irq(&info->lock);
		info->swapped--;
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);

		if (sgp == SGP_WRITE)
			mark_page_accessed(page);

		delete_from_swap_cache(page);
		set_page_dirty(page);
		swap_free(swap);

	} else {
		/* shmem_symlink() */
		if (mapping->a_ops != &shmem_aops)
			goto alloc_nohuge;
		/* Global overrides first, then the per-mount huge= setting */
		if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
			goto alloc_nohuge;
		if (shmem_huge == SHMEM_HUGE_FORCE)
			goto alloc_huge;
		switch (sbinfo->huge) {
			loff_t i_size;
			pgoff_t off;
		case SHMEM_HUGE_NEVER:
			goto alloc_nohuge;
		case SHMEM_HUGE_WITHIN_SIZE:
			off = round_up(index, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
					i_size >> PAGE_SHIFT >= off)
				goto alloc_huge;
			/* fallthrough */
		case SHMEM_HUGE_ADVISE:
			if (sgp_huge == SGP_HUGE)
				goto alloc_huge;
			/* TODO: implement fadvise() hints */
			goto alloc_nohuge;
		}

		/*
		 * Try a huge page first and fall back to a small one if that
		 * fails; alloc_nohuge is also entered directly from above and
		 * from the error paths below.
		 */
alloc_huge:
		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
				index, true);
		if (IS_ERR(page)) {
alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
					index, false);
		}
		if (IS_ERR(page)) {
			int retry = 5;
			error = PTR_ERR(page);
			page = NULL;
			if (error != -ENOSPC)
				goto failed;
			/*
			 * Try to reclaim some space by splitting a huge page
			 * beyond i_size on the filesystem.
			 */
			while (retry--) {
				int ret;
				ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
				if (ret == SHRINK_STOP)
					break;
				if (ret)
					goto alloc_nohuge;
			}
			goto failed;
		}

		if (PageTransHuge(page))
			hindex = round_down(index, HPAGE_PMD_NR);
		else
			hindex = index;

		if (sgp == SGP_WRITE)
			__SetPageReferenced(page);

		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
				PageTransHuge(page));
		if (error)
			goto unacct;
		error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
				compound_order(page));
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, hindex,
							NULL);
			radix_tree_preload_end();
		}
		if (error) {
			mem_cgroup_cancel_charge(page, memcg,
					PageTransHuge(page));
			goto unacct;
		}
		mem_cgroup_commit_charge(page, memcg, false,
				PageTransHuge(page));
		lru_cache_add_anon(page);

		spin_lock_irq(&info->lock);
		info->alloced += 1 << compound_order(page);
		inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
		alloced = true;

		if (PageTransHuge(page) &&
				DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
				hindex + HPAGE_PMD_NR - 1) {
			/*
			 * Part of the huge page is beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			spin_lock(&sbinfo->shrinklist_lock);
			if (list_empty(&info->shrinklist)) {
				list_add_tail(&info->shrinklist,
						&sbinfo->shrinklist);
				sbinfo->shrinklist_len++;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}

		/*
		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
		 */
		if (sgp == SGP_FALLOC)
			sgp = SGP_WRITE;
clear:
		/*
		 * Let SGP_WRITE caller clear ends if write does not fill page;
		 * but SGP_FALLOC on a page fallocated earlier must initialize
		 * it now, lest undo on failure cancel our earlier guarantee.
		 */
		if (sgp != SGP_WRITE && !PageUptodate(page)) {
			struct page *head = compound_head(page);
			int i;

			/* Zero every subpage of the (possibly huge) page */
			for (i = 0; i < (1 << compound_order(head)); i++) {
				clear_highpage(head + i);
				flush_dcache_page(head + i);
			}
			SetPageUptodate(head);
		}
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp <= SGP_CACHE &&
	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
		if (alloced) {
			ClearPageDirty(page);
			delete_from_page_cache(page);
			spin_lock_irq(&info->lock);
			shmem_recalc_inode(inode);
			spin_unlock_irq(&info->lock);
		}
		error = -EINVAL;
		goto unlock;
	}
	/* Hand back the subpage for @index when page is a huge page head */
	*pagep = page + index - hindex;
	return 0;

	/*
	 * Error recovery.
	 */
unacct:
	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks,
				1 << compound_order(page));
	shmem_unacct_blocks(info->flags, 1 << compound_order(page));

	/* A huge page could not be used: retry with a small one */
	if (PageTransHuge(page)) {
		unlock_page(page);
		put_page(page);
		goto alloc_nohuge;
	}
failed:
	/* If the swap entry changed under us, just look everything up again */
	if (swap.val && !shmem_confirm_swap(mapping, index, swap))
		error = -EEXIST;
unlock:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	if (error == -ENOSPC && !once++) {
		info = SHMEM_I(inode);
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
		goto repeat;
	}
	if (error == -EEXIST)	/* from above or from radix_tree_insert */
		goto repeat;
	return error;
}
1850
1851/*
1852 * This is like autoremove_wake_function, but it removes the wait queue
1853 * entry unconditionally - even if something else had already woken the
1854 * target.
1855 */
1856static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
1857{
1858	int ret = default_wake_function(wait, mode, sync, key);
1859	list_del_init(&wait->task_list);
1860	return ret;
1861}
1862
/*
 * Page fault handler for shmem mappings.
 *
 * If the faulting offset lies inside a range that is currently being
 * hole-punched, wait for the punch to finish and return VM_FAULT_RETRY
 * (after releasing mmap_sem, when the fault flags allow that) or
 * VM_FAULT_NOPAGE.  Otherwise obtain the page via shmem_getpage_gfp().
 *
 * Returns VM_FAULT_LOCKED (possibly with VM_FAULT_MAJOR or'ed in) and
 * vmf->page set on success, VM_FAULT_OOM on -ENOMEM, and VM_FAULT_SIGBUS
 * on any other error.
 */
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
	enum sgp_type sgp;
	int error;
	int ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_mutex.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_mutex in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		/* Recheck i_private under i_lock before trusting it */
		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

			ret = VM_FAULT_NOPAGE;
			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
				/* It's polite to up mmap_sem if we can */
				up_read(&vma->vm_mm->mmap_sem);
				ret = VM_FAULT_RETRY;
			}

			/* Queue ourselves before dropping i_lock, then sleep */
			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	/* Pass the vma's huge page advice on to shmem_getpage_gfp() */
	sgp = SGP_CACHE;
	if (vma->vm_flags & VM_HUGEPAGE)
		sgp = SGP_HUGE;
	else if (vma->vm_flags & VM_NOHUGEPAGE)
		sgp = SGP_NOHUGE;

	error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
				  gfp, vma->vm_mm, &ret);
	if (error)
		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
	return ret;
}
1941
/*
 * Pick an address for mapping a shmem object, preferring one that lets the
 * mapping be backed by huge pages.
 *
 * The mm's ordinary get_unmapped_area() result is always computed first and
 * is what gets returned whenever huge alignment is not wanted or not
 * possible.  Otherwise a larger area is requested and an address inside it
 * is chosen whose offset within a huge page matches the file offset's.
 *
 * Returns the chosen address, -ENOMEM if @len exceeds TASK_SIZE, or the
 * error value produced by the mm's get_unmapped_area().
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long uaddr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	unsigned long (*get_area)(struct file *,
		unsigned long, unsigned long, unsigned long, unsigned long);
	unsigned long addr;
	unsigned long offset;
	unsigned long inflated_len;
	unsigned long inflated_addr;
	unsigned long inflated_offset;

	if (len > TASK_SIZE)
		return -ENOMEM;

	get_area = current->mm->get_unmapped_area;
	addr = get_area(file, uaddr, len, pgoff, flags);

	/* From here on, every early exit falls back to the plain result */
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
		return addr;
	if (IS_ERR_VALUE(addr))
		return addr;
	if (addr & ~PAGE_MASK)
		return addr;
	if (addr > TASK_SIZE - len)
		return addr;

	if (shmem_huge == SHMEM_HUGE_DENY)
		return addr;
	if (len < HPAGE_PMD_SIZE)
		return addr;
	if (flags & MAP_FIXED)
		return addr;
	/*
	 * Our priority is to support MAP_SHARED mapped hugely;
	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
	 * But if caller specified an address hint, respect that as before.
	 */
	if (uaddr)
		return addr;

	if (shmem_huge != SHMEM_HUGE_FORCE) {
		struct super_block *sb;

		if (file) {
			VM_BUG_ON(file->f_op != &shmem_file_operations);
			sb = file_inode(file)->i_sb;
		} else {
			/*
			 * Called directly from mm/mmap.c, or drivers/char/mem.c
			 * for "/dev/zero", to create a shared anonymous object.
			 */
			if (IS_ERR(shm_mnt))
				return addr;
			sb = shm_mnt->mnt_sb;
		}
		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
			return addr;
	}

	/* Offset of the mapping's start within a huge page of the file */
	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
		return addr;
	/* Already aligned the way we want */
	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
		return addr;

	/* Ask for enough extra room to slide to the wanted alignment */
	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
	if (inflated_len > TASK_SIZE)
		return addr;
	if (inflated_len < len)		/* arithmetic wrapped */
		return addr;

	inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
	if (IS_ERR_VALUE(inflated_addr))
		return addr;
	if (inflated_addr & ~PAGE_MASK)
		return addr;

	/*
	 * Move to the first address at or above inflated_addr whose offset
	 * within a huge page equals offset.
	 */
	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += HPAGE_PMD_SIZE;

	if (inflated_addr > TASK_SIZE - len)
		return addr;
	return inflated_addr;
}
2029
2030#ifdef CONFIG_NUMA
2031static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2032{
2033	struct inode *inode = file_inode(vma->vm_file);
2034	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2035}
2036
2037static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2038					  unsigned long addr)
2039{
2040	struct inode *inode = file_inode(vma->vm_file);
2041	pgoff_t index;
2042
2043	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2044	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2045}
2046#endif
2047
2048int shmem_lock(struct file *file, int lock, struct user_struct *user)
2049{
2050	struct inode *inode = file_inode(file);
2051	struct shmem_inode_info *info = SHMEM_I(inode);
2052	int retval = -ENOMEM;
2053
2054	spin_lock_irq(&info->lock);
2055	if (lock && !(info->flags & VM_LOCKED)) {
2056		if (!user_shm_lock(inode->i_size, user))
2057			goto out_nomem;
2058		info->flags |= VM_LOCKED;
2059		mapping_set_unevictable(file->f_mapping);
2060	}
2061	if (!lock && (info->flags & VM_LOCKED) && user) {
2062		user_shm_unlock(inode->i_size, user);
2063		info->flags &= ~VM_LOCKED;
2064		mapping_clear_unevictable(file->f_mapping);
2065	}
2066	retval = 0;
2067
2068out_nomem:
2069	spin_unlock_irq(&info->lock);
2070	return retval;
2071}
2072
2073static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2074{
2075	file_accessed(file);
2076	vma->vm_ops = &shmem_vm_ops;
2077	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2078			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2079			(vma->vm_end & HPAGE_PMD_MASK)) {
2080		khugepaged_enter(vma, vma->vm_flags);
2081	}
2082	return 0;
2083}
2084
2085static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
2086				     umode_t mode, dev_t dev, unsigned long flags)
2087{
2088	struct inode *inode;
2089	struct shmem_inode_info *info;
2090	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2091
2092	if (shmem_reserve_inode(sb))
2093		return NULL;
2094
2095	inode = new_inode(sb);
2096	if (inode) {
2097		inode->i_ino = get_next_ino();
2098		inode_init_owner(inode, dir, mode);
2099		inode->i_blocks = 0;
2100		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2101		inode->i_generation = get_seconds();
2102		info = SHMEM_I(inode);
2103		memset(info, 0, (char *)inode - (char *)info);
2104		spin_lock_init(&info->lock);
2105		info->seals = F_SEAL_SEAL;
2106		info->flags = flags & VM_NORESERVE;
2107		INIT_LIST_HEAD(&info->shrinklist);
2108		INIT_LIST_HEAD(&info->swaplist);
2109		simple_xattrs_init(&info->xattrs);
2110		cache_no_acl(inode);
2111
2112		switch (mode & S_IFMT) {
2113		default:
2114			inode->i_op = &shmem_special_inode_operations;
2115			init_special_inode(inode, mode, dev);
2116			break;
2117		case S_IFREG:
2118			inode->i_mapping->a_ops = &shmem_aops;
2119			inode->i_op = &shmem_inode_operations;
2120			inode->i_fop = &shmem_file_operations;
2121			mpol_shared_policy_init(&info->policy,
2122						 shmem_get_sbmpol(sbinfo));
2123			break;
2124		case S_IFDIR:
2125			inc_nlink(inode);
2126			/* Some things misbehave if size == 0 on a directory */
2127			inode->i_size = 2 * BOGO_DIRENT_SIZE;
2128			inode->i_op = &shmem_dir_inode_operations;
2129			inode->i_fop = &simple_dir_operations;
2130			break;
2131		case S_IFLNK:
2132			/*
2133			 * Must not load anything in the rbtree,
2134			 * mpol_free_shared_policy will not be called.
2135			 */
2136			mpol_shared_policy_init(&info->policy, NULL);
2137			break;
2138		}
2139	} else
2140		shmem_free_inode(sb);
2141	return inode;
2142}
2143
2144bool shmem_mapping(struct address_space *mapping)
2145{
2146	if (!mapping->host)
2147		return false;
2148
2149	return mapping->host->i_sb->s_op == &shmem_ops;
2150}
2151
2152#ifdef CONFIG_TMPFS
2153static const struct inode_operations shmem_symlink_inode_operations;
2154static const struct inode_operations shmem_short_symlink_operations;
2155
2156#ifdef CONFIG_TMPFS_XATTR
2157static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2158#else
2159#define shmem_initxattrs NULL
2160#endif
2161
2162static int
2163shmem_write_begin(struct file *file, struct address_space *mapping,
2164			loff_t pos, unsigned len, unsigned flags,
2165			struct page **pagep, void **fsdata)
2166{
2167	struct inode *inode = mapping->host;
2168	struct shmem_inode_info *info = SHMEM_I(inode);
2169	pgoff_t index = pos >> PAGE_SHIFT;
2170
2171	/* i_mutex is held by caller */
2172	if (unlikely(info->seals)) {
2173		if (info->seals & F_SEAL_WRITE)
2174			return -EPERM;
2175		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2176			return -EPERM;
2177	}
2178
2179	return shmem_getpage(inode, index, pagep, SGP_WRITE);
2180}
2181
2182static int
2183shmem_write_end(struct file *file, struct address_space *mapping,
2184			loff_t pos, unsigned len, unsigned copied,
2185			struct page *page, void *fsdata)
2186{
2187	struct inode *inode = mapping->host;
2188
2189	if (pos + copied > inode->i_size)
2190		i_size_write(inode, pos + copied);
2191
2192	if (!PageUptodate(page)) {
2193		struct page *head = compound_head(page);
2194		if (PageTransCompound(page)) {
2195			int i;
2196
2197			for (i = 0; i < HPAGE_PMD_NR; i++) {
2198				if (head + i == page)
2199					continue;
2200				clear_highpage(head + i);
2201				flush_dcache_page(head + i);
2202			}
2203		}
2204		if (copied < PAGE_SIZE) {
2205			unsigned from = pos & (PAGE_SIZE - 1);
2206			zero_user_segments(page, 0, from,
2207					from + copied, PAGE_SIZE);
2208		}
2209		SetPageUptodate(head);
2210	}
2211	set_page_dirty(page);
2212	unlock_page(page);
2213	put_page(page);
2214
2215	return copied;
2216}
2217
/*
 * ->read_iter for shmem files: copy data into @to one page at a time,
 * starting at iocb->ki_pos.
 *
 * Each iteration looks the page up with shmem_getpage(); when no page
 * is returned, the data is taken from ZERO_PAGE(0) instead.  i_size is
 * sampled before and again after the lookup to bound the copy.  On exit
 * ki_pos is advanced to the position reached and file_accessed() is
 * called.
 *
 * Returns the number of bytes copied if any, otherwise the last error
 * (0 at end of file; -EINVAL from shmem_getpage() is also turned into 0).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	enum sgp_type sgp = SGP_READ;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (!iter_is_iovec(to))
		sgp = SGP_CACHE;

	/* Split the file position into a page index and an in-page offset. */
	index = *ppos >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		/* Stop before the lookup if we are already at or past EOF. */
		end_index = i_size >> PAGE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, sgp);
		if (error) {
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page) {
			if (sgp == SGP_CACHE)
				set_page_dirty(page);
			unlock_page(page);
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_mutex protection against truncate
		 */
		nr = PAGE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset) {
				if (page)
					put_page(page);
				break;
			}
		}
		/* nr is now the number of bytes available from offset. */
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			/* No page was returned: read zeroes from the zero page. */
			page = ZERO_PAGE(0);
			get_page(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, to);
		retval += ret;
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;

		put_page(page);
		if (!iov_iter_count(to))
			break;
		/* A short copy with room left in the iterator is a fault. */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}
2327
/*
 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
 *
 * Scans the entries of @mapping from page @index upwards in batches
 * obtained with find_get_entries().  A gap between consecutive entry
 * indices, or a page that is not uptodate, counts as a hole; an
 * uptodate page or an exceptional entry counts as data.
 *
 * Returns the page index at which the scan stopped.  When no further
 * entries exist, that is @end for SEEK_DATA and the current index for
 * SEEK_HOLE.
 */
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
				    pgoff_t index, pgoff_t end, int whence)
{
	struct page *page;
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	bool done = false;
	int i;

	pagevec_init(&pvec, 0);
	pvec.nr = 1;		/* start small: we may be there already */
	while (!done) {
		pvec.nr = find_get_entries(mapping, index,
					pvec.nr, pvec.pages, indices);
		if (!pvec.nr) {
			/* Nothing left in the tree from index onwards. */
			if (whence == SEEK_DATA)
				index = end;
			break;
		}
		for (i = 0; i < pvec.nr; i++, index++) {
			if (index < indices[i]) {
				/* Gap before this entry: that is a hole. */
				if (whence == SEEK_HOLE) {
					done = true;
					break;
				}
				index = indices[i];
			}
			page = pvec.pages[i];
			/* A real page that is not uptodate is treated as a hole. */
			if (page && !radix_tree_exceptional_entry(page)) {
				if (!PageUptodate(page))
					page = NULL;
			}
			if (index >= end ||
			    (page && whence == SEEK_DATA) ||
			    (!page && whence == SEEK_HOLE)) {
				done = true;
				break;
			}
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		/* Later rounds fetch a full batch. */
		pvec.nr = PAGEVEC_SIZE;
		cond_resched();
	}
	return index;
}
2377
2378static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2379{
2380	struct address_space *mapping = file->f_mapping;
2381	struct inode *inode = mapping->host;
2382	pgoff_t start, end;
2383	loff_t new_offset;
2384
2385	if (whence != SEEK_DATA && whence != SEEK_HOLE)
2386		return generic_file_llseek_size(file, offset, whence,
2387					MAX_LFS_FILESIZE, i_size_read(inode));
2388	inode_lock(inode);
2389	/* We're holding i_mutex so we can access i_size directly */
2390
2391	if (offset < 0)
2392		offset = -EINVAL;
2393	else if (offset >= inode->i_size)
2394		offset = -ENXIO;
2395	else {
2396		start = offset >> PAGE_SHIFT;
2397		end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2398		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
2399		new_offset <<= PAGE_SHIFT;
2400		if (new_offset > offset) {
2401			if (new_offset < inode->i_size)
2402				offset = new_offset;
2403			else if (whence == SEEK_DATA)
2404				offset = -ENXIO;
2405			else
2406				offset = inode->i_size;
2407		}
2408	}
2409
2410	if (offset >= 0)
2411		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2412	inode_unlock(inode);
2413	return offset;
2414}
2415
2416/*
2417 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
2418 * so reuse a tag which we firmly believe is never set or cleared on shmem.
2419 */
2420#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
2421#define LAST_SCAN               4       /* about 150ms max */
2422
/*
 * Walk every slot of @mapping's page tree under rcu_read_lock() and set
 * SHMEM_TAG_PINNED on each page whose reference count exceeds its map
 * count by more than one.
 */
static void shmem_tag_pins(struct address_space *mapping)
{
	struct radix_tree_iter iter;
	void **slot;
	pgoff_t start;
	struct page *page;

	lru_add_drain();
	start = 0;
	rcu_read_lock();

	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		page = radix_tree_deref_slot(slot);
		if (!page || radix_tree_exception(page)) {
			/* Slot changed under us: restart the lookup here. */
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
		} else if (page_count(page) - page_mapcount(page) > 1) {
			/* Tags are modified under tree_lock. */
			spin_lock_irq(&mapping->tree_lock);
			radix_tree_tag_set(&mapping->page_tree, iter.index,
					   SHMEM_TAG_PINNED);
			spin_unlock_irq(&mapping->tree_lock);
		}

		if (need_resched()) {
			cond_resched_rcu();
			slot = radix_tree_iter_next(&iter);
		}
	}
	rcu_read_unlock();
}
2455
2456/*
2457 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
2458 * via get_user_pages(), drivers might have some pending I/O without any active
2459 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
2460 * and see whether it has an elevated ref-count. If so, we tag them and wait for
2461 * them to be dropped.
2462 * The caller must guarantee that no new user will acquire writable references
2463 * to those pages to avoid races.
2464 */
static int shmem_wait_for_pins(struct address_space *mapping)
{
	struct radix_tree_iter iter;
	void **slot;
	pgoff_t start;
	struct page *page;
	int error, scan;

	/* Tag the pages that currently look pinned. */
	shmem_tag_pins(mapping);

	error = 0;
	/* Re-scan the tagged pages up to LAST_SCAN + 1 times. */
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		/* Done as soon as no tagged page remains. */
		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
			break;

		/*
		 * First pass: drain the LRU caches.  Later passes sleep for
		 * a period that doubles each time; a non-zero return from
		 * schedule_timeout_killable() makes this the last pass.
		 */
		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;

		start = 0;
		rcu_read_lock();
		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
					   start, SHMEM_TAG_PINNED) {

			page = radix_tree_deref_slot(slot);
			if (radix_tree_exception(page)) {
				if (radix_tree_deref_retry(page)) {
					slot = radix_tree_iter_retry(&iter);
					continue;
				}

				/* Other exceptional entries just lose the tag. */
				page = NULL;
			}

			if (page &&
			    page_count(page) - page_mapcount(page) != 1) {
				/* Still pinned: keep the tag for the next pass. */
				if (scan < LAST_SCAN)
					goto continue_resched;

				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found pages pinned.
				 */
				error = -EBUSY;
			}

			spin_lock_irq(&mapping->tree_lock);
			radix_tree_tag_clear(&mapping->page_tree,
					     iter.index, SHMEM_TAG_PINNED);
			spin_unlock_irq(&mapping->tree_lock);
continue_resched:
			if (need_resched()) {
				cond_resched_rcu();
				slot = radix_tree_iter_next(&iter);
			}
		}
		rcu_read_unlock();
	}

	/* 0, or -EBUSY if a page was still pinned on the last pass. */
	return error;
}
2528
2529#define F_ALL_SEALS (F_SEAL_SEAL | \
2530		     F_SEAL_SHRINK | \
2531		     F_SEAL_GROW | \
2532		     F_SEAL_WRITE)
2533
2534int shmem_add_seals(struct file *file, unsigned int seals)
2535{
2536	struct inode *inode = file_inode(file);
2537	struct shmem_inode_info *info = SHMEM_I(inode);
2538	int error;
2539
2540	/*
2541	 * SEALING
2542	 * Sealing allows multiple parties to share a shmem-file but restrict
2543	 * access to a specific subset of file operations. Seals can only be
2544	 * added, but never removed. This way, mutually untrusted parties can
2545	 * share common memory regions with a well-defined policy. A malicious
2546	 * peer can thus never perform unwanted operations on a shared object.
2547	 *
2548	 * Seals are only supported on special shmem-files and always affect
2549	 * the whole underlying inode. Once a seal is set, it may prevent some
2550	 * kinds of access to the file. Currently, the following seals are
2551	 * defined:
2552	 *   SEAL_SEAL: Prevent further seals from being set on this file
2553	 *   SEAL_SHRINK: Prevent the file from shrinking
2554	 *   SEAL_GROW: Prevent the file from growing
2555	 *   SEAL_WRITE: Prevent write access to the file
2556	 *
2557	 * As we don't require any trust relationship between two parties, we
2558	 * must prevent seals from being removed. Therefore, sealing a file
2559	 * only adds a given set of seals to the file, it never touches
2560	 * existing seals. Furthermore, the "setting seals"-operation can be
2561	 * sealed itself, which basically prevents any further seal from being
2562	 * added.
2563	 *
2564	 * Semantics of sealing are only defined on volatile files. Only
2565	 * anonymous shmem files support sealing. More importantly, seals are
2566	 * never written to disk. Therefore, there's no plan to support it on
2567	 * other file types.
2568	 */
2569
2570	if (file->f_op != &shmem_file_operations)
2571		return -EINVAL;
2572	if (!(file->f_mode & FMODE_WRITE))
2573		return -EPERM;
2574	if (seals & ~(unsigned int)F_ALL_SEALS)
2575		return -EINVAL;
2576
2577	inode_lock(inode);
2578
2579	if (info->seals & F_SEAL_SEAL) {
2580		error = -EPERM;
2581		goto unlock;
2582	}
2583
2584	if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
2585		error = mapping_deny_writable(file->f_mapping);
2586		if (error)
2587			goto unlock;
2588
2589		error = shmem_wait_for_pins(file->f_mapping);
2590		if (error) {
2591			mapping_allow_writable(file->f_mapping);
2592			goto unlock;
2593		}
2594	}
2595
2596	info->seals |= seals;
2597	error = 0;
2598
2599unlock:
2600	inode_unlock(inode);
2601	return error;
2602}
2603EXPORT_SYMBOL_GPL(shmem_add_seals);
2604
2605int shmem_get_seals(struct file *file)
2606{
2607	if (file->f_op != &shmem_file_operations)
2608		return -EINVAL;
2609
2610	return SHMEM_I(file_inode(file))->seals;
2611}
2612EXPORT_SYMBOL_GPL(shmem_get_seals);
2613
2614long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
2615{
2616	long error;
2617
2618	switch (cmd) {
2619	case F_ADD_SEALS:
2620		/* disallow upper 32bit */
2621		if (arg > UINT_MAX)
2622			return -EINVAL;
2623
2624		error = shmem_add_seals(file, arg);
2625		break;
2626	case F_GET_SEALS:
2627		error = shmem_get_seals(file);
2628		break;
2629	default:
2630		error = -EINVAL;
2631		break;
2632	}
2633
2634	return error;
2635}
2636
/*
 * ->fallocate for shmem files.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any
 * other mode bit gives -EOPNOTSUPP.  The whole operation runs under the
 * inode lock.
 *
 * Hole punching is refused with -EPERM when F_SEAL_WRITE is set;
 * otherwise the range is unmapped and truncated while a shmem_falloc
 * record describing it is published in inode->i_private.
 *
 * Preallocation gets each page in the range with
 * shmem_getpage(SGP_FALLOC) and dirties it.  On failure the pages added
 * so far are removed with shmem_undo_range().  On success i_size is
 * extended unless FALLOC_FL_KEEP_SIZE is set, and i_ctime is updated.
 *
 * Returns 0 or a negative errno.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Whole pages inside [offset, offset + len). */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_mutex */
		if (info->seals & F_SEAL_WRITE) {
			error = -EPERM;
			goto out;
		}

		/* Publish the range being punched via i_private. */
		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/* Withdraw the record and wake anyone waiting on it. */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/* Page range [start, end) covering the requested bytes. */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	/* Publish preallocation progress via i_private (no waitq here). */
	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	for (index = start; index < end; index++) {
		struct page *page;

		/*
		 * Good, the fallocate(2) manpage permits EINTR: we may have
		 * been interrupted because we are using up too much memory.
		 */
		if (signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
		if (error) {
			/* Remove the !PageUptodate pages we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		/*
		 * Inform shmem_writepage() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		shmem_falloc.next++;
		if (!PageUptodate(page))
			shmem_falloc.nr_falloced++;

		/*
		 * If !PageUptodate, leave it that way so that freeable pages
		 * can be recognized if we need to rollback on error later.
		 * But set_page_dirty so that memory pressure will swap rather
		 * than free the pages we are allocating (and SGP_CACHE pages
		 * might still be clean: we now need to mark those dirty too).
		 */
		set_page_dirty(page);
		unlock_page(page);
		put_page(page);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = current_time(inode);
undone:
	/* Withdraw the progress record on success and failure alike. */
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	inode_unlock(inode);
	return error;
}
2768
2769static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2770{
2771	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2772
2773	buf->f_type = TMPFS_MAGIC;
2774	buf->f_bsize = PAGE_SIZE;
2775	buf->f_namelen = NAME_MAX;
2776	if (sbinfo->max_blocks) {
2777		buf->f_blocks = sbinfo->max_blocks;
2778		buf->f_bavail =
2779		buf->f_bfree  = sbinfo->max_blocks -
2780				percpu_counter_sum(&sbinfo->used_blocks);
2781	}
2782	if (sbinfo->max_inodes) {
2783		buf->f_files = sbinfo->max_inodes;
2784		buf->f_ffree = sbinfo->free_inodes;
2785	}
2786	/* else leave those fields 0 like simple_statfs */
2787	return 0;
2788}
2789
2790/*
2791 * File creation. Allocate an inode, and we're done..
2792 */
2793static int
2794shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
2795{
2796	struct inode *inode;
2797	int error = -ENOSPC;
2798
2799	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2800	if (inode) {
2801		error = simple_acl_create(dir, inode);
2802		if (error)
2803			goto out_iput;
2804		error = security_inode_init_security(inode, dir,
2805						     &dentry->d_name,
2806						     shmem_initxattrs, NULL);
2807		if (error && error != -EOPNOTSUPP)
2808			goto out_iput;
2809
2810		error = 0;
2811		dir->i_size += BOGO_DIRENT_SIZE;
2812		dir->i_ctime = dir->i_mtime = current_time(dir);
2813		d_instantiate(dentry, inode);
2814		dget(dentry); /* Extra count - pin the dentry in core */
2815	}
2816	return error;
2817out_iput:
2818	iput(inode);
2819	return error;
2820}
2821
2822static int
2823shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2824{
2825	struct inode *inode;
2826	int error = -ENOSPC;
2827
2828	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2829	if (inode) {
2830		error = security_inode_init_security(inode, dir,
2831						     NULL,
2832						     shmem_initxattrs, NULL);
2833		if (error && error != -EOPNOTSUPP)
2834			goto out_iput;
2835		error = simple_acl_create(dir, inode);
2836		if (error)
2837			goto out_iput;
2838		d_tmpfile(dentry, inode);
2839	}
2840	return error;
2841out_iput:
2842	iput(inode);
2843	return error;
2844}
2845
2846static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2847{
2848	int error;
2849
2850	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
2851		return error;
2852	inc_nlink(dir);
2853	return 0;
2854}
2855
2856static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2857		bool excl)
2858{
2859	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
2860}
2861
2862/*
2863 * Link a file..
2864 */
2865static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2866{
2867	struct inode *inode = d_inode(old_dentry);
2868	int ret;
2869
2870	/*
2871	 * No ordinary (disk based) filesystem counts links as inodes;
2872	 * but each new link needs a new dentry, pinning lowmem, and
2873	 * tmpfs dentries cannot be pruned until they are unlinked.
2874	 */
2875	ret = shmem_reserve_inode(inode->i_sb);
2876	if (ret)
2877		goto out;
2878
2879	dir->i_size += BOGO_DIRENT_SIZE;
2880	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2881	inc_nlink(inode);
2882	ihold(inode);	/* New dentry reference */
2883	dget(dentry);		/* Extra pinning count for the created dentry */
2884	d_instantiate(dentry, inode);
2885out:
2886	return ret;
2887}
2888
2889static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2890{
2891	struct inode *inode = d_inode(dentry);
2892
2893	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2894		shmem_free_inode(inode->i_sb);
2895
2896	dir->i_size -= BOGO_DIRENT_SIZE;
2897	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2898	drop_nlink(inode);
2899	dput(dentry);	/* Undo the count from "create" - this does all the work */
2900	return 0;
2901}
2902
2903static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2904{
2905	if (!simple_empty(dentry))
2906		return -ENOTEMPTY;
2907
2908	drop_nlink(d_inode(dentry));
2909	drop_nlink(dir);
2910	return shmem_unlink(dir, dentry);
2911}
2912
2913static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2914{
2915	bool old_is_dir = d_is_dir(old_dentry);
2916	bool new_is_dir = d_is_dir(new_dentry);
2917
2918	if (old_dir != new_dir && old_is_dir != new_is_dir) {
2919		if (old_is_dir) {
2920			drop_nlink(old_dir);
2921			inc_nlink(new_dir);
2922		} else {
2923			drop_nlink(new_dir);
2924			inc_nlink(old_dir);
2925		}
2926	}
2927	old_dir->i_ctime = old_dir->i_mtime =
2928	new_dir->i_ctime = new_dir->i_mtime =
2929	d_inode(old_dentry)->i_ctime =
2930	d_inode(new_dentry)->i_ctime = current_time(old_dir);
2931
2932	return 0;
2933}
2934
2935static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
2936{
2937	struct dentry *whiteout;
2938	int error;
2939
2940	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
2941	if (!whiteout)
2942		return -ENOMEM;
2943
2944	error = shmem_mknod(old_dir, whiteout,
2945			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
2946	dput(whiteout);
2947	if (error)
2948		return error;
2949
2950	/*
2951	 * Cheat and hash the whiteout while the old dentry is still in
2952	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
2953	 *
2954	 * d_lookup() will consistently find one of them at this point,
2955	 * not sure which one, but that isn't even important.
2956	 */
2957	d_rehash(whiteout);
2958	return 0;
2959}
2960
2961/*
2962 * The VFS layer already does all the dentry stuff for rename,
2963 * we just have to decrement the usage count for the target if
2964 * it exists so that the VFS layer correctly free's it when it
2965 * gets overwritten.
2966 */
2967static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
2968{
2969	struct inode *inode = d_inode(old_dentry);
2970	int they_are_dirs = S_ISDIR(inode->i_mode);
2971
2972	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2973		return -EINVAL;
2974
2975	if (flags & RENAME_EXCHANGE)
2976		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
2977
2978	if (!simple_empty(new_dentry))
2979		return -ENOTEMPTY;
2980
2981	if (flags & RENAME_WHITEOUT) {
2982		int error;
2983
2984		error = shmem_whiteout(old_dir, old_dentry);
2985		if (error)
2986			return error;
2987	}
2988
2989	if (d_really_is_positive(new_dentry)) {
2990		(void) shmem_unlink(new_dir, new_dentry);
2991		if (they_are_dirs) {
2992			drop_nlink(d_inode(new_dentry));
2993			drop_nlink(old_dir);
2994		}
2995	} else if (they_are_dirs) {
2996		drop_nlink(old_dir);
2997		inc_nlink(new_dir);
2998	}
2999
3000	old_dir->i_size -= BOGO_DIRENT_SIZE;
3001	new_dir->i_size += BOGO_DIRENT_SIZE;
3002	old_dir->i_ctime = old_dir->i_mtime =
3003	new_dir->i_ctime = new_dir->i_mtime =
3004	inode->i_ctime = current_time(old_dir);
3005	return 0;
3006}
3007
3008static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
3009{
3010	int error;
3011	int len;
3012	struct inode *inode;
3013	struct page *page;
3014	struct shmem_inode_info *info;
3015
3016	len = strlen(symname) + 1;
3017	if (len > PAGE_SIZE)
3018		return -ENAMETOOLONG;
3019
3020	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
3021	if (!inode)
3022		return -ENOSPC;
3023
3024	error = security_inode_init_security(inode, dir, &dentry->d_name,
3025					     shmem_initxattrs, NULL);
3026	if (error) {
3027		if (error != -EOPNOTSUPP) {
3028			iput(inode);
3029			return error;
3030		}
3031		error = 0;
3032	}
3033
3034	info = SHMEM_I(inode);
3035	inode->i_size = len-1;
3036	if (len <= SHORT_SYMLINK_LEN) {
3037		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3038		if (!inode->i_link) {
3039			iput(inode);
3040			return -ENOMEM;
3041		}
3042		inode->i_op = &shmem_short_symlink_operations;
3043	} else {
3044		inode_nohighmem(inode);
3045		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
3046		if (error) {
3047			iput(inode);
3048			return error;
3049		}
3050		inode->i_mapping->a_ops = &shmem_aops;
3051		inode->i_op = &shmem_symlink_inode_operations;
3052		memcpy(page_address(page), symname, len);
3053		SetPageUptodate(page);
3054		set_page_dirty(page);
3055		unlock_page(page);
3056		put_page(page);
3057	}
3058	dir->i_size += BOGO_DIRENT_SIZE;
3059	dir->i_ctime = dir->i_mtime = current_time(dir);
3060	d_instantiate(dentry, inode);
3061	dget(dentry);
3062	return 0;
3063}
3064
/*
 * Delayed-call callback registered by shmem_get_link(): mark the page
 * passed as @arg accessed and drop the reference held on it.
 */
static void shmem_put_link(void *arg)
{
	struct page *page = arg;

	mark_page_accessed(page);
	put_page(page);
}
3070
3071static const char *shmem_get_link(struct dentry *dentry,
3072				  struct inode *inode,
3073				  struct delayed_call *done)
3074{
3075	struct page *page = NULL;
3076	int error;
3077	if (!dentry) {
3078		page = find_get_page(inode->i_mapping, 0);
3079		if (!page)
3080			return ERR_PTR(-ECHILD);
3081		if (!PageUptodate(page)) {
3082			put_page(page);
3083			return ERR_PTR(-ECHILD);
3084		}
3085	} else {
3086		error = shmem_getpage(inode, 0, &page, SGP_READ);
3087		if (error)
3088			return ERR_PTR(error);
3089		unlock_page(page);
3090	}
3091	set_delayed_call(done, shmem_put_link, page);
3092	return page_address(page);
3093}
3094
3095#ifdef CONFIG_TMPFS_XATTR
3096/*
3097 * Superblocks without xattr inode operations may get some security.* xattr
3098 * support from the LSM "for free". As soon as we have any other xattrs
3099 * like ACLs, we also need to implement the security.* handlers at
3100 * filesystem level, though.
3101 */
3102
3103/*
3104 * Callback for security_inode_init_security() for acquiring xattrs.
3105 */
3106static int shmem_initxattrs(struct inode *inode,
3107			    const struct xattr *xattr_array,
3108			    void *fs_info)
3109{
3110	struct shmem_inode_info *info = SHMEM_I(inode);
3111	const struct xattr *xattr;
3112	struct simple_xattr *new_xattr;
3113	size_t len;
3114
3115	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3116		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3117		if (!new_xattr)
3118			return -ENOMEM;
3119
3120		len = strlen(xattr->name) + 1;
3121		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3122					  GFP_KERNEL);
3123		if (!new_xattr->name) {
3124			kfree(new_xattr);
3125			return -ENOMEM;
3126		}
3127
3128		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3129		       XATTR_SECURITY_PREFIX_LEN);
3130		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3131		       xattr->name, len);
3132
3133		simple_xattr_list_add(&info->xattrs, new_xattr);
3134	}
3135
3136	return 0;
3137}
3138
3139static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3140				   struct dentry *unused, struct inode *inode,
3141				   const char *name, void *buffer, size_t size)
3142{
3143	struct shmem_inode_info *info = SHMEM_I(inode);
3144
3145	name = xattr_full_name(handler, name);
3146	return simple_xattr_get(&info->xattrs, name, buffer, size);
3147}
3148
3149static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3150				   struct dentry *unused, struct inode *inode,
3151				   const char *name, const void *value,
3152				   size_t size, int flags)
3153{
3154	struct shmem_inode_info *info = SHMEM_I(inode);
3155
3156	name = xattr_full_name(handler, name);
3157	return simple_xattr_set(&info->xattrs, name, value, size, flags);
3158}
3159
/* Handler for the XATTR_SECURITY_PREFIX namespace, using the shared get/set. */
static const struct xattr_handler shmem_security_xattr_handler = {
	.prefix = XATTR_SECURITY_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
3165
/* Handler for the XATTR_TRUSTED_PREFIX namespace, using the shared get/set. */
static const struct xattr_handler shmem_trusted_xattr_handler = {
	.prefix = XATTR_TRUSTED_PREFIX,
	.get = shmem_xattr_handler_get,
	.set = shmem_xattr_handler_set,
};
3171
/*
 * NULL-terminated handler table installed as sb->s_xattr by
 * shmem_fill_super().  The POSIX ACL handlers are only present when
 * CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
#endif
	&shmem_security_xattr_handler,
	&shmem_trusted_xattr_handler,
	NULL
};
3181
3182static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3183{
3184	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3185	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3186}
3187#endif /* CONFIG_TMPFS_XATTR */
3188
/*
 * Inode operations for symlinks resolved through simple_get_link().
 * NOTE(review): presumably used for short targets kept in inode->i_link
 * (shmem_destroy_callback() frees i_link for symlinks) - confirm against
 * shmem_symlink().
 */
static const struct inode_operations shmem_short_symlink_operations = {
	.readlink	= generic_readlink,
	.get_link	= simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
3196
/*
 * Inode operations for symlinks whose target is read from page 0 of the
 * inode's mapping by shmem_get_link().
 */
static const struct inode_operations shmem_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.get_link	= shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
};
3204
/*
 * export_operations ->get_parent(): parent lookup is not supported, so
 * every request is answered with -ESTALE.
 */
static struct dentry *shmem_get_parent(struct dentry *child)
{
	return ERR_PTR(-ESTALE);
}
3209
3210static int shmem_match(struct inode *ino, void *vfh)
3211{
3212	__u32 *fh = vfh;
3213	__u64 inum = fh[2];
3214	inum = (inum << 32) | fh[1];
3215	return ino->i_ino == inum && fh[0] == ino->i_generation;
3216}
3217
3218static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3219		struct fid *fid, int fh_len, int fh_type)
3220{
3221	struct inode *inode;
3222	struct dentry *dentry = NULL;
3223	u64 inum;
3224
3225	if (fh_len < 3)
3226		return NULL;
3227
3228	inum = fid->raw[2];
3229	inum = (inum << 32) | fid->raw[1];
3230
3231	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3232			shmem_match, fid->raw);
3233	if (inode) {
3234		dentry = d_find_alias(inode);
3235		iput(inode);
3236	}
3237
3238	return dentry;
3239}
3240
3241static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3242				struct inode *parent)
3243{
3244	if (*len < 3) {
3245		*len = 3;
3246		return FILEID_INVALID;
3247	}
3248
3249	if (inode_unhashed(inode)) {
3250		/* Unfortunately insert_inode_hash is not idempotent,
3251		 * so as we hash inodes here rather than at creation
3252		 * time, we need a lock to ensure we only try
3253		 * to do it once
3254		 */
3255		static DEFINE_SPINLOCK(lock);
3256		spin_lock(&lock);
3257		if (inode_unhashed(inode))
3258			__insert_inode_hash(inode,
3259					    inode->i_ino + inode->i_generation);
3260		spin_unlock(&lock);
3261	}
3262
3263	fh[0] = inode->i_generation;
3264	fh[1] = inode->i_ino;
3265	fh[2] = ((__u64)inode->i_ino) >> 32;
3266
3267	*len = 3;
3268	return 1;
3269}
3270
/* File-handle export operations, installed as sb->s_export_op. */
static const struct export_operations shmem_export_ops = {
	.get_parent     = shmem_get_parent,
	.encode_fh      = shmem_encode_fh,
	.fh_to_dentry	= shmem_fh_to_dentry,
};
3276
/*
 * Parse a comma-separated tmpfs mount option string into @sbinfo.
 *
 * @options: writable option string; it is modified in place (separators are
 *           overwritten with NUL bytes).  May be NULL.
 * @sbinfo:  configuration to update.
 * @remount: when true, the "mode", "uid" and "gid" options are skipped.
 *
 * Recognised options: size, nr_blocks, nr_inodes, mode, uid, gid, plus
 * "huge" (CONFIG_TRANSPARENT_HUGE_PAGECACHE) and "mpol" (CONFIG_NUMA).
 *
 * Returns 0 on success, in which case sbinfo->mpol is set to the parsed
 * mempolicy (or NULL if none was given).  Returns 1 on error after logging
 * a message and dropping any mempolicy parsed so far; fields of @sbinfo
 * already updated by earlier options are not rolled back.
 */
static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
			       bool remount)
{
	char *this_char, *value, *rest;
	struct mempolicy *mpol = NULL;
	uid_t uid;
	gid_t gid;

	while (options != NULL) {
		this_char = options;
		for (;;) {
			/*
			 * NUL-terminate this option: unfortunately,
			 * mount options form a comma-separated list,
			 * but mpol's nodelist may also contain commas.
			 */
			options = strchr(options, ',');
			if (options == NULL)
				break;
			options++;
			/* A comma followed by a digit is kept as part of the value. */
			if (!isdigit(*options)) {
				options[-1] = '\0';
				break;
			}
		}
		/* Skip empty options (e.g. consecutive commas). */
		if (!*this_char)
			continue;
		/* Every option must be of the form name=value. */
		if ((value = strchr(this_char,'=')) != NULL) {
			*value++ = 0;
		} else {
			pr_err("tmpfs: No value for mount option '%s'\n",
			       this_char);
			goto error;
		}

		if (!strcmp(this_char,"size")) {
			unsigned long long size;
			size = memparse(value,&rest);
			/* "size=N%" is a percentage of total RAM. */
			if (*rest == '%') {
				size <<= PAGE_SHIFT;
				size *= totalram_pages;
				do_div(size, 100);
				rest++;
			}
			if (*rest)
				goto bad_val;
			/* The limit is stored in pages, rounded up. */
			sbinfo->max_blocks =
				DIV_ROUND_UP(size, PAGE_SIZE);
		} else if (!strcmp(this_char,"nr_blocks")) {
			sbinfo->max_blocks = memparse(value, &rest);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"nr_inodes")) {
			sbinfo->max_inodes = memparse(value, &rest);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"mode")) {
			if (remount)
				continue;
			/* Octal; only the permission and special bits are kept. */
			sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"uid")) {
			if (remount)
				continue;
			/*
			 * NOTE(review): simple_strtoul() returns unsigned long,
			 * so a value wider than uid_t is silently truncated
			 * here - confirm whether that is acceptable.
			 */
			uid = simple_strtoul(value, &rest, 0);
			if (*rest)
				goto bad_val;
			/* Map the id through the mounter's user namespace. */
			sbinfo->uid = make_kuid(current_user_ns(), uid);
			if (!uid_valid(sbinfo->uid))
				goto bad_val;
		} else if (!strcmp(this_char,"gid")) {
			if (remount)
				continue;
			/* NOTE(review): same truncation caveat as for "uid". */
			gid = simple_strtoul(value, &rest, 0);
			if (*rest)
				goto bad_val;
			sbinfo->gid = make_kgid(current_user_ns(), gid);
			if (!gid_valid(sbinfo->gid))
				goto bad_val;
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
		} else if (!strcmp(this_char, "huge")) {
			int huge;
			huge = shmem_parse_huge(value);
			if (huge < 0)
				goto bad_val;
			/* Without THP support only "never" is accepted. */
			if (!has_transparent_hugepage() &&
					huge != SHMEM_HUGE_NEVER)
				goto bad_val;
			sbinfo->huge = huge;
#endif
#ifdef CONFIG_NUMA
		} else if (!strcmp(this_char,"mpol")) {
			/* A later mpol= replaces an earlier one. */
			mpol_put(mpol);
			mpol = NULL;
			if (mpol_parse_str(value, &mpol))
				goto bad_val;
#endif
		} else {
			pr_err("tmpfs: Bad mount option %s\n", this_char);
			goto error;
		}
	}
	/* Hand the parsed policy (and its reference) over to the caller. */
	sbinfo->mpol = mpol;
	return 0;

bad_val:
	pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
	       value, this_char);
error:
	mpol_put(mpol);
	return 1;

}
3391
3392static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3393{
3394	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3395	struct shmem_sb_info config = *sbinfo;
3396	unsigned long inodes;
3397	int error = -EINVAL;
3398
3399	config.mpol = NULL;
3400	if (shmem_parse_options(data, &config, true))
3401		return error;
3402
3403	spin_lock(&sbinfo->stat_lock);
3404	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3405	if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3406		goto out;
3407	if (config.max_inodes < inodes)
3408		goto out;
3409	/*
3410	 * Those tests disallow limited->unlimited while any are in use;
3411	 * but we must separately disallow unlimited->limited, because
3412	 * in that case we have no record of how much is already in use.
3413	 */
3414	if (config.max_blocks && !sbinfo->max_blocks)
3415		goto out;
3416	if (config.max_inodes && !sbinfo->max_inodes)
3417		goto out;
3418
3419	error = 0;
3420	sbinfo->huge = config.huge;
3421	sbinfo->max_blocks  = config.max_blocks;
3422	sbinfo->max_inodes  = config.max_inodes;
3423	sbinfo->free_inodes = config.max_inodes - inodes;
3424
3425	/*
3426	 * Preserve previous mempolicy unless mpol remount option was specified.
3427	 */
3428	if (config.mpol) {
3429		mpol_put(sbinfo->mpol);
3430		sbinfo->mpol = config.mpol;	/* transfers initial ref */
3431	}
3432out:
3433	spin_unlock(&sbinfo->stat_lock);
3434	return error;
3435}
3436
/*
 * super_operations ->show_options(): emits the mount options of @root's
 * superblock into @seq.  size, nr_inodes, mode, uid and gid are printed
 * only when they differ from the defaults compared against below; the huge
 * option is printed when non-zero.  Always returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);

	if (sbinfo->max_blocks != shmem_default_max_blocks())
		/* max_blocks is in pages; report it in kilobytes. */
		seq_printf(seq, ",size=%luk",
			sbinfo->max_blocks << (PAGE_SHIFT - 10));
	if (sbinfo->max_inodes != shmem_default_max_inodes())
		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
	/* Ids are reported as seen from the initial user namespace. */
	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(seq, ",uid=%u",
				from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(seq, ",gid=%u",
				from_kgid_munged(&init_user_ns, sbinfo->gid));
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
	if (sbinfo->huge)
		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
	shmem_show_mpol(seq, sbinfo->mpol);
	return 0;
}
3462
/*
 * memfd_create() names: the user-supplied name is prefixed with
 * MFD_NAME_PREFIX, and prefix plus name must fit within NAME_MAX.
 */
#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Every flag bit memfd_create() accepts; anything else yields -EINVAL. */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
3468
/*
 * memfd_create(2): create an unlinked shmem file named "memfd:<uname>" and
 * return a new file descriptor for it.
 *
 * @uname: user-space pointer to the NUL-terminated name.
 * @flags: subset of MFD_ALL_FLAGS.
 *
 * Returns the new fd on success, or a negative errno: -EINVAL for unknown
 * flags or an over-long name, -EFAULT if the name cannot be read (or its
 * terminator changed while being copied), -ENOMEM if the name buffer cannot
 * be allocated, or the error from fd allocation / shmem_file_setup().
 */
SYSCALL_DEFINE2(memfd_create,
		const char __user *, uname,
		unsigned int, flags)
{
	struct shmem_inode_info *info;
	struct file *file;
	int fd, error;
	char *name;
	long len;

	if (flags & ~(unsigned int)MFD_ALL_FLAGS)
		return -EINVAL;

	/* length includes terminating zero */
	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
	if (len <= 0)
		return -EFAULT;
	if (len > MFD_NAME_MAX_LEN + 1)
		return -EINVAL;

	/* Room for the prefix plus the name with its terminator. */
	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
	if (!name)
		return -ENOMEM;

	strcpy(name, MFD_NAME_PREFIX);
	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
		error = -EFAULT;
		goto err_name;
	}

	/* terminating-zero may have changed after strnlen_user() returned */
	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
		error = -EFAULT;
		goto err_name;
	}

	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
	if (fd < 0) {
		error = fd;
		goto err_name;
	}

	file = shmem_file_setup(name, 0, VM_NORESERVE);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_fd;
	}
	info = SHMEM_I(file_inode(file));
	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_RDWR | O_LARGEFILE;
	/* Sealing requested: drop F_SEAL_SEAL from the inode's seals. */
	if (flags & MFD_ALLOW_SEALING)
		info->seals &= ~F_SEAL_SEAL;

	/* Publish the file only once it is fully set up. */
	fd_install(fd, file);
	kfree(name);
	return fd;

err_fd:
	put_unused_fd(fd);
err_name:
	kfree(name);
	return error;
}
3532
3533#endif /* CONFIG_TMPFS */
3534
/*
 * super_operations ->put_super(), also used as the failure path of
 * shmem_fill_super(): destroys the used_blocks counter, drops the
 * mempolicy reference, frees the per-superblock info and clears
 * sb->s_fs_info.
 */
static void shmem_put_super(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	percpu_counter_destroy(&sbinfo->used_blocks);
	mpol_put(sbinfo->mpol);
	kfree(sbinfo);
	sb->s_fs_info = NULL;
}
3544
/*
 * Fill in a new tmpfs superblock: allocate and initialise the
 * per-superblock info, parse the mount options in @data (user mounts with
 * CONFIG_TMPFS only), set up the superblock fields and create the root
 * directory.
 *
 * Returns 0 on success, -EINVAL if the options are rejected, or -ENOMEM if
 * an allocation fails.  On failure everything allocated here is released
 * through shmem_put_super().  @silent is not used.
 */
int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *inode;
	struct shmem_sb_info *sbinfo;
	int err = -ENOMEM;

	/* Round up to L1_CACHE_BYTES to resist false sharing */
	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				L1_CACHE_BYTES), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;

	/* Defaults: sticky world-writable root owned by the mounter. */
	sbinfo->mode = S_IRWXUGO | S_ISVTX;
	sbinfo->uid = current_fsuid();
	sbinfo->gid = current_fsgid();
	sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance, limiting inodes to one per page of lowmem;
	 * but the internal instance is left unlimited.
	 */
	if (!(sb->s_flags & MS_KERNMOUNT)) {
		sbinfo->max_blocks = shmem_default_max_blocks();
		sbinfo->max_inodes = shmem_default_max_inodes();
		if (shmem_parse_options(data, sbinfo, false)) {
			err = -EINVAL;
			goto failed;
		}
	} else {
		sb->s_flags |= MS_NOUSER;
	}
	sb->s_export_op = &shmem_export_ops;
	sb->s_flags |= MS_NOSEC;
#else
	sb->s_flags |= MS_NOUSER;
#endif

	spin_lock_init(&sbinfo->stat_lock);
	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
		goto failed;
	/* No inode has been handed out yet. */
	sbinfo->free_inodes = sbinfo->max_inodes;
	spin_lock_init(&sbinfo->shrinklist_lock);
	INIT_LIST_HEAD(&sbinfo->shrinklist);

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	sb->s_op = &shmem_ops;
	sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
	sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	sb->s_flags |= MS_POSIXACL;
#endif

	/* Create the root directory with the configured mode and owner. */
	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
	if (!inode)
		goto failed;
	inode->i_uid = sbinfo->uid;
	inode->i_gid = sbinfo->gid;
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto failed;
	return 0;

failed:
	shmem_put_super(sb);
	return err;
}
3618
/*
 * Slab cache for struct shmem_inode_info, created by
 * shmem_init_inodecache(); a non-NULL value also tells shmem_init() that
 * initialisation has already happened.
 */
static struct kmem_cache *shmem_inode_cachep;
3620
3621static struct inode *shmem_alloc_inode(struct super_block *sb)
3622{
3623	struct shmem_inode_info *info;
3624	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3625	if (!info)
3626		return NULL;
3627	return &info->vfs_inode;
3628}
3629
3630static void shmem_destroy_callback(struct rcu_head *head)
3631{
3632	struct inode *inode = container_of(head, struct inode, i_rcu);
3633	if (S_ISLNK(inode->i_mode))
3634		kfree(inode->i_link);
3635	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3636}
3637
3638static void shmem_destroy_inode(struct inode *inode)
3639{
3640	if (S_ISREG(inode->i_mode))
3641		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3642	call_rcu(&inode->i_rcu, shmem_destroy_callback);
3643}
3644
/*
 * Slab constructor for the inode cache: @foo is a struct shmem_inode_info
 * whose embedded VFS inode gets its one-time initialisation.
 */
static void shmem_init_inode(void *foo)
{
	struct shmem_inode_info *info = foo;
	inode_init_once(&info->vfs_inode);
}
3650
/*
 * Create the slab cache for shmem inodes.  Always returns 0: the cache is
 * created with SLAB_PANIC, so the return value of kmem_cache_create() is
 * not checked here.
 */
static int shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
	return 0;
}
3658
/* Destroy the shmem inode slab cache (error path of shmem_init()). */
static void shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
3663
/*
 * Address-space operations for shmem mappings.  The write_begin/write_end
 * hooks exist only with CONFIG_TMPFS, page migration only with
 * CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.set_page_dirty	= __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
	.error_remove_page = generic_error_remove_page,
};
3676
/*
 * File operations for shmem regular files.  mmap and get_unmapped_area are
 * always available; the read/write/seek/splice/fallocate entry points are
 * provided only with CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};
3690
/*
 * Inode operations table with getattr/setattr; presumably the one used for
 * regular files (confirm against shmem_get_inode()).  Note that set_acl is
 * gated on CONFIG_TMPFS_XATTR in this table.
 */
static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
#endif
};
3699
/*
 * Inode operations for directories.  Namespace operations (create, link,
 * rename, ...) exist only with CONFIG_TMPFS; xattr listing and ACL support
 * depend on their own config options.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
3721
/*
 * Inode operations table offering only xattr listing and ACL hooks;
 * presumably used for special files such as device nodes, FIFOs and
 * sockets (confirm against shmem_get_inode()).
 */
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};
3731
/*
 * Superblock operations, installed as sb->s_op by shmem_fill_super().
 * statfs/remount/show_options require CONFIG_TMPFS; the cached-object hooks
 * require CONFIG_TRANSPARENT_HUGE_PAGECACHE.
 */
static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.remount_fs	= shmem_remount_fs,
	.show_options	= shmem_show_options,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};
3748
/*
 * VM operations for shmem mappings (installed on the vma by
 * shmem_zero_setup()); the NUMA policy hooks exist only with CONFIG_NUMA.
 */
static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};
3757
/*
 * file_system_type ->mount(): mounts tmpfs via mount_nodev() with
 * shmem_fill_super() as the superblock filler.  @dev_name is not used.
 */
static struct dentry *shmem_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
3763
/* The "tmpfs" filesystem type (full CONFIG_SHMEM implementation). */
static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.mount		= shmem_mount,
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT,
};
3771
3772int __init shmem_init(void)
3773{
3774	int error;
3775
3776	/* If rootfs called this, don't re-init */
3777	if (shmem_inode_cachep)
3778		return 0;
3779
3780	error = shmem_init_inodecache();
3781	if (error)
3782		goto out3;
3783
3784	error = register_filesystem(&shmem_fs_type);
3785	if (error) {
3786		pr_err("Could not register tmpfs\n");
3787		goto out2;
3788	}
3789
3790	shm_mnt = kern_mount(&shmem_fs_type);
3791	if (IS_ERR(shm_mnt)) {
3792		error = PTR_ERR(shm_mnt);
3793		pr_err("Could not kern_mount tmpfs\n");
3794		goto out1;
3795	}
3796
3797#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3798	if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
3799		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3800	else
3801		shmem_huge = 0; /* just in case it was patched */
3802#endif
3803	return 0;
3804
3805out1:
3806	unregister_filesystem(&shmem_fs_type);
3807out2:
3808	shmem_destroy_inodecache();
3809out3:
3810	shm_mnt = ERR_PTR(error);
3811	return error;
3812}
3813
3814#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
3815static ssize_t shmem_enabled_show(struct kobject *kobj,
3816		struct kobj_attribute *attr, char *buf)
3817{
3818	int values[] = {
3819		SHMEM_HUGE_ALWAYS,
3820		SHMEM_HUGE_WITHIN_SIZE,
3821		SHMEM_HUGE_ADVISE,
3822		SHMEM_HUGE_NEVER,
3823		SHMEM_HUGE_DENY,
3824		SHMEM_HUGE_FORCE,
3825	};
3826	int i, count;
3827
3828	for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
3829		const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
3830
3831		count += sprintf(buf + count, fmt,
3832				shmem_format_huge(values[i]));
3833	}
3834	buf[count - 1] = '\n';
3835	return count;
3836}
3837
3838static ssize_t shmem_enabled_store(struct kobject *kobj,
3839		struct kobj_attribute *attr, const char *buf, size_t count)
3840{
3841	char tmp[16];
3842	int huge;
3843
3844	if (count + 1 > sizeof(tmp))
3845		return -EINVAL;
3846	memcpy(tmp, buf, count);
3847	tmp[count] = '\0';
3848	if (count && tmp[count - 1] == '\n')
3849		tmp[count - 1] = '\0';
3850
3851	huge = shmem_parse_huge(tmp);
3852	if (huge == -EINVAL)
3853		return -EINVAL;
3854	if (!has_transparent_hugepage() &&
3855			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
3856		return -EINVAL;
3857
3858	shmem_huge = huge;
3859	if (shmem_huge < SHMEM_HUGE_DENY)
3860		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3861	return count;
3862}
3863
/* sysfs attribute "shmem_enabled" (mode 0644) wired to the handlers above. */
struct kobj_attribute shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3866#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
3867
3868#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/*
 * Decide whether huge pages may be used for the shmem mapping @vma.
 *
 * The global shmem_huge overrides are checked first: FORCE enables and
 * DENY disables unconditionally.  Otherwise the per-superblock policy of
 * the file's filesystem decides.
 */
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	loff_t i_size;
	pgoff_t off;

	if (shmem_huge == SHMEM_HUGE_FORCE)
		return true;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	switch (sbinfo->huge) {
		case SHMEM_HUGE_NEVER:
			return false;
		case SHMEM_HUGE_ALWAYS:
			return true;
		case SHMEM_HUGE_WITHIN_SIZE:
			/*
			 * Enabled when the page-rounded file size covers at
			 * least one huge page and reaches the mapping's start
			 * offset rounded up to a huge-page boundary.
			 */
			off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
			i_size = round_up(i_size_read(inode), PAGE_SIZE);
			if (i_size >= HPAGE_PMD_SIZE &&
					i_size >> PAGE_SHIFT >= off)
				return true;
			/* fall through: otherwise treat like "advise" */
		case SHMEM_HUGE_ADVISE:
			/* TODO: implement fadvise() hints */
			return (vma->vm_flags & VM_HUGEPAGE);
		default:
			/* Any other per-superblock value is a bug. */
			VM_BUG_ON(1);
			return false;
	}
}
3899#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
3900
3901#else /* !CONFIG_SHMEM */
3902
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
3911
/* The "tmpfs" filesystem type for tiny-shmem: mounting goes through ramfs. */
static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.mount		= ramfs_mount,
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT,
};
3918
/*
 * tiny-shmem initialisation: registers the filesystem type and creates the
 * internal kernel mount.  Any failure is fatal (BUG_ON), so this always
 * returns 0.
 */
int __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));

	return 0;
}
3928
/* tiny-shmem stub: does nothing and always returns 0. */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	return 0;
}
3933
/* tiny-shmem stub: does nothing and always returns 0. */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	return 0;
}
3938
/* tiny-shmem stub: nothing to do. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
3942
3943#ifdef CONFIG_MMU
/*
 * tiny-shmem version: simply defers to the current mm's
 * get_unmapped_area() method with the arguments unchanged.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
3950#endif
3951
/*
 * tiny-shmem version: drops the page-cache pages of @inode in the byte
 * range [@lstart, @lend] via truncate_inode_pages_range().
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
3957
/*
 * tiny-shmem: map the shmem names used by the common code below onto their
 * generic/ramfs equivalents; size accounting becomes a no-op.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)
3963
3964#endif /* CONFIG_SHMEM */
3965
3966/* common code */
3967
/* Dentry operations for the pseudo dentries of unlinked shmem files. */
static const struct dentry_operations anon_ops = {
	.d_dname = simple_dname
};
3971
/*
 * Create an unlinked file of @size bytes on the internal shmem mount.
 *
 * @name:    name for the pseudo dentry.
 * @size:    initial i_size; must be within [0, MAX_LFS_FILESIZE].
 * @flags:   VM flags passed to the size accounting and to shmem_get_inode().
 * @i_flags: extra inode flags OR-ed into inode->i_flags (e.g. S_PRIVATE).
 *
 * Returns the new file, or an ERR_PTR(): the shm_mnt error if the internal
 * mount failed, -EINVAL for a bad size, -ENOMEM if accounting or the dentry
 * allocation fails, -ENOSPC if no inode can be obtained, or the error from
 * ramfs_nommu_expand_for_mapping() / alloc_file().
 */
static struct file *__shmem_file_setup(const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct file *res;
	struct inode *inode;
	struct path path;
	struct super_block *sb;
	struct qstr this;

	if (IS_ERR(shm_mnt))
		return ERR_CAST(shm_mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	res = ERR_PTR(-ENOMEM);
	this.name = name;
	this.len = strlen(name);
	this.hash = 0; /* will go */
	sb = shm_mnt->mnt_sb;
	path.mnt = mntget(shm_mnt);
	path.dentry = d_alloc_pseudo(sb, &this);
	if (!path.dentry)
		goto put_memory;
	d_set_d_op(path.dentry, &anon_ops);

	res = ERR_PTR(-ENOSPC);
	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
	if (!inode)
		goto put_memory;

	inode->i_flags |= i_flags;
	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	/*
	 * NOTE(review): from here on failures jump to put_path and skip
	 * shmem_unacct_size(); presumably the accounting is released when
	 * path_put() drops the instantiated inode - confirm against
	 * shmem_evict_inode().
	 */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (IS_ERR(res))
		goto put_path;

	res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
		  &shmem_file_operations);
	if (IS_ERR(res))
		goto put_path;

	return res;

put_memory:
	shmem_unacct_size(flags, size);
put_path:
	path_put(&path);
	return res;
}
4027
/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * 	kernel internal.  There will be NO LSM permission checks against the
 * 	underlying inode.  So users of this interface must do LSM checks at a
 * 	higher layer.  The users are the big_key and shm implementations.  LSM
 * 	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Returns the new file or an ERR_PTR() from __shmem_file_setup(); the inode
 * is created with S_PRIVATE set.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(name, size, flags, S_PRIVATE);
}
4042
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Returns the new file or an ERR_PTR() from __shmem_file_setup(); no extra
 * inode flags are set.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
4054
/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
 *
 * Creates an unlinked shmem file sized to the vma, installs it as
 * vma->vm_file (dropping any file previously attached) and switches the
 * vma to shmem_vm_ops.
 *
 * Returns 0 on success or the error from __shmem_file_setup().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_sem leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_vm_ops;

	/*
	 * Register with khugepaged only if the vma contains at least one
	 * fully aligned huge-page-sized range.  The HPAGE_PMD_MASK
	 * expressions must stay behind the IS_ENABLED() test.
	 */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
			(vma->vm_end & HPAGE_PMD_MASK)) {
		khugepaged_enter(vma, vma->vm_flags);
	}

	return 0;
}
4087
4088/**
4089 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4090 * @mapping:	the page's address_space
4091 * @index:	the page index
4092 * @gfp:	the page allocator flags to use if allocating
4093 *
4094 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4095 * with any new page allocations done using the specified allocation flags.
4096 * But read_cache_page_gfp() uses the ->readpage() method: which does not
4097 * suit tmpfs, since it may have pages in swapcache, and needs to find those
4098 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4099 *
4100 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4101 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4102 */
4103struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4104					 pgoff_t index, gfp_t gfp)
4105{
4106#ifdef CONFIG_SHMEM
4107	struct inode *inode = mapping->host;
4108	struct page *page;
4109	int error;
4110
4111	BUG_ON(mapping->a_ops != &shmem_aops);
4112	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
4113				  gfp, NULL, NULL);
4114	if (error)
4115		page = ERR_PTR(error);
4116	else
4117		unlock_page(page);
4118	return page;
4119#else
4120	/*
4121	 * The tiny !SHMEM case uses ramfs without swap
4122	 */
4123	return read_cache_page_gfp(mapping, index, gfp);
4124#endif
4125}
4126EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);