Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * linux/fs/namei.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
8/*
9 * Some corrections by tytso.
10 */
11
12/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
13 * lookup logic.
14 */
15/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
16 */
17
18#include <linux/init.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/wordpart.h>
22#include <linux/fs.h>
23#include <linux/filelock.h>
24#include <linux/namei.h>
25#include <linux/pagemap.h>
26#include <linux/sched/mm.h>
27#include <linux/fsnotify.h>
28#include <linux/personality.h>
29#include <linux/security.h>
30#include <linux/syscalls.h>
31#include <linux/mount.h>
32#include <linux/audit.h>
33#include <linux/capability.h>
34#include <linux/file.h>
35#include <linux/fcntl.h>
36#include <linux/device_cgroup.h>
37#include <linux/fs_struct.h>
38#include <linux/posix_acl.h>
39#include <linux/hash.h>
40#include <linux/bitops.h>
41#include <linux/init_task.h>
42#include <linux/uaccess.h>
43
44#include "internal.h"
45#include "mount.h"
46
47/* [Feb-1997 T. Schoebel-Theuer]
48 * Fundamental changes in the pathname lookup mechanisms (namei)
49 * were necessary because of omirr. The reason is that omirr needs
50 * to know the _real_ pathname, not the user-supplied one, in case
51 * of symlinks (and also when transname replacements occur).
52 *
53 * The new code replaces the old recursive symlink resolution with
54 * an iterative one (in case of non-nested symlink chains). It does
55 * this with calls to <fs>_follow_link().
56 * As a side effect, dir_namei(), _namei() and follow_link() are now
57 * replaced with a single function lookup_dentry() that can handle all
58 * the special cases of the former code.
59 *
60 * With the new dcache, the pathname is stored at each inode, at least as
61 * long as the refcount of the inode is positive. As a side effect, the
62 * size of the dcache depends on the inode cache and thus is dynamic.
63 *
64 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
65 * resolution to correspond with current state of the code.
66 *
67 * Note that the symlink resolution is not *completely* iterative.
68 * There is still a significant amount of tail- and mid- recursion in
69 * the algorithm. Also, note that <fs>_readlink() is not used in
70 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
71 * may return different results than <fs>_follow_link(). Many virtual
72 * filesystems (including /proc) exhibit this behavior.
73 */
74
75/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
76 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
77 * and the name already exists in form of a symlink, try to create the new
78 * name indicated by the symlink. The old code always complained that the
79 * name already exists, due to not following the symlink even if its target
80 * is nonexistent. The new semantics affects also mknod() and link() when
81 * the name is a symlink pointing to a non-existent name.
82 *
83 * I don't know which semantics is the right one, since I have no access
84 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
85 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
86 * "old" one. Personally, I think the new semantics is much more logical.
87 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
89 * and in the old Linux semantics.
90 */
91
92/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
93 * semantics. See the comments in "open_namei" and "do_link" below.
94 *
95 * [10-Sep-98 Alan Modra] Another symlink change.
96 */
97
98/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
99 * inside the path - always follow.
100 * in the last component in creation/removal/renaming - never follow.
101 * if LOOKUP_FOLLOW passed - follow.
102 * if the pathname has trailing slashes - follow.
103 * otherwise - don't follow.
104 * (applied in that order).
105 *
106 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
107 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
108 * During the 2.4 we need to fix the userland stuff depending on it -
109 * hopefully we will be able to get rid of that wart in 2.5. So far only
110 * XEmacs seems to be relying on it...
111 */
112/*
113 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
115 * any extra contention...
116 */
117
118/* In order to reduce some races, while at the same time doing additional
119 * checking and hopefully speeding things up, we copy filenames to the
120 * kernel data space before using them..
121 *
122 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
123 * PATH_MAX includes the nul terminator --RR.
124 */
125
/* room for a name embedded in a __getname() allocation (see getname_flags()) */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
127
128static inline void initname(struct filename *name, const char __user *uptr)
129{
130 name->uptr = uptr;
131 name->aname = NULL;
132 atomic_set(&name->refcnt, 1);
133}
134
/**
 * getname_flags - copy a pathname from userspace
 * @filename: userspace pointer to the pathname
 * @flags: LOOKUP_* flags; only LOOKUP_EMPTY is examined here
 *
 * Copies the user-supplied name into kernel space, embedding it inside the
 * names_cache allocation when it fits.  Returns a referenced struct filename
 * or an ERR_PTR: -ENOMEM on allocation failure, -ENOENT for an empty path
 * without LOOKUP_EMPTY, -ENAMETOOLONG past PATH_MAX; negative results from
 * strncpy_from_user() are passed through.
 */
struct filename *
getname_flags(const char __user *filename, int flags)
{
	struct filename *result;
	char *kname;
	int len;

	/* audit may already hold a copy of this name; reuse it if so */
	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result->iname;
	result->name = kname;

	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
	/*
	 * Handle both empty path and copy failure in one go.
	 */
	if (unlikely(len <= 0)) {
		if (unlikely(len < 0)) {
			__putname(result);
			return ERR_PTR(len);
		}

		/* The empty path is special. */
		if (!(flags & LOOKUP_EMPTY)) {
			__putname(result);
			return ERR_PTR(-ENOENT);
		}
	}

	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
		const size_t size = offsetof(struct filename, iname[1]);
		kname = (char *)result;

		/*
		 * size is chosen so that result->iname[0] is within the
		 * same object and that kname can't be equal to
		 * result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
		}
		result->name = kname;
		/* re-copy, this time with the whole allocation available */
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		/* The empty path is special. */
		if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENOENT);
		}
		/* still no NUL within PATH_MAX: the name is too long */
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
	}
	initname(result, filename);
	audit_getname(result);
	return result;
}
217
218struct filename *getname_uflags(const char __user *filename, int uflags)
219{
220 int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
221
222 return getname_flags(filename, flags);
223}
224
225struct filename *__getname_maybe_null(const char __user *pathname)
226{
227 struct filename *name;
228 char c;
229
230 /* try to save on allocations; loss on um, though */
231 if (get_user(c, pathname))
232 return ERR_PTR(-EFAULT);
233 if (!c)
234 return NULL;
235
236 name = getname_flags(pathname, LOOKUP_EMPTY);
237 if (!IS_ERR(name) && !(name->name[0])) {
238 putname(name);
239 name = NULL;
240 }
241 return name;
242}
243
/**
 * getname_kernel - wrap an in-kernel pathname in a struct filename
 * @filename: NUL-terminated kernel-space string
 *
 * Counterpart of getname_flags() for names that already live in kernel
 * space, so no userspace copy is needed.  Returns ERR_PTR(-ENOMEM) or
 * ERR_PTR(-ENAMETOOLONG) on failure.
 */
struct filename *getname_kernel(const char * filename)
{
	struct filename *result;
	int len = strlen(filename) + 1;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	if (len <= EMBEDDED_NAME_MAX) {
		/* short name: embed the string in the same allocation */
		result->name = (char *)result->iname;
	} else if (len <= PATH_MAX) {
		/*
		 * Long name: dedicate the whole __getname() allocation to
		 * the string and kmalloc a separate struct for the header
		 * (mirrors the large-name layout in getname_flags()).
		 */
		const size_t size = offsetof(struct filename, iname[1]);
		struct filename *tmp;

		tmp = kmalloc(size, GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
	initname(result, NULL);
	audit_getname(result);
	return result;
}
EXPORT_SYMBOL(getname_kernel);
276
/**
 * putname - drop a reference to a struct filename
 * @name: name to release; ERR_PTR and NULL are silently ignored
 *
 * Frees the name when the last reference is dropped.  The common
 * single-reference case skips the atomic decrement entirely.
 */
void putname(struct filename *name)
{
	int refcnt;

	if (IS_ERR_OR_NULL(name))
		return;

	refcnt = atomic_read(&name->refcnt);
	if (unlikely(refcnt != 1)) {
		/* zero refcount here means a use-after-put; refuse to free */
		if (WARN_ON_ONCE(!refcnt))
			return;

		if (!atomic_dec_and_test(&name->refcnt))
			return;
	}

	/* struct and string allocated separately? (see getname_flags()) */
	if (unlikely(name->name != name->iname)) {
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
}
EXPORT_SYMBOL(putname);
300
/**
 * check_acl - perform ACL permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieve POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns -EAGAIN when no ACL verdict applies and the plain mode bits
 * should be used instead.
 */
static int check_acl(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
		/* rcu-walk: only the already-cached ACL may be consulted */
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
		if (!acl)
			return -EAGAIN;
		/* no ->get_inode_acl() calls in RCU mode... */
		if (is_uncached_acl(acl))
			return -ECHILD;	/* drop to ref-walk to fetch it */
		return posix_acl_permission(idmap, inode, acl, mask);
	}

	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(idmap, inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	return -EAGAIN;
}
345
/*
 * Very quick optimistic "we know we have no ACL's" check.
 *
 * Note that this is purely for ACL_TYPE_ACCESS, and purely
 * for the "we have cached that there are no ACLs" case.
 *
 * If this returns true, we know there are no ACLs. But if
 * it returns false, we might still not have ACLs (it could
 * be the is_uncached_acl() case).
 */
static inline bool no_acl_inode(struct inode *inode)
{
#ifdef CONFIG_FS_POSIX_ACL
	/* lockless peek: i_acl may be updated concurrently elsewhere */
	return likely(!READ_ONCE(inode->i_acl));
#else
	/* no ACL support compiled in: there can never be ACLs */
	return true;
#endif
}
364
/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 when access is granted, -EACCES (or an ACL error) otherwise.
 */
static int acl_permission_check(struct mnt_idmap *idmap,
				struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;
	vfsuid_t vfsuid;

	/*
	 * Common cheap case: everybody has the requested
	 * rights, and there are no ACLs to check. No need
	 * to do any owner/group checks in that case.
	 *
	 *  - 'mask&7' is the requested permission bit set
	 *  - multiplying by 0111 spreads them out to all of ugo
	 *  - '& ~mode' looks for missing inode permission bits
	 *  - the '!' is for "no missing permissions"
	 *
	 * After that, we just need to check that there are no
	 * ACL's on the inode - do the 'IS_POSIXACL()' check last
	 * because it will dereference the ->i_sb pointer and we
	 * want to avoid that if at all possible.
	 */
	if (!((mask & 7) * 0111 & ~mode)) {
		if (no_acl_inode(inode))
			return 0;
		if (!IS_POSIXACL(inode))
			return 0;
	}

	/* Are we the owner? If so, ACL's don't matter */
	vfsuid = i_uid_into_vfsuid(idmap, inode);
	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
		mask &= 7;
		mode >>= 6;	/* owner permission bits */
		return (mask & ~mode) ? -EACCES : 0;
	}

	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(idmap, inode, mask);
		/* -EAGAIN means "no ACL verdict; fall back to mode bits" */
		if (error != -EAGAIN)
			return error;
	}

	/* Only RWX matters for group/other mode bits */
	mask &= 7;

	/*
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
	 */
	if (mask & (mode ^ (mode >> 3))) {
		vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
		if (vfsgid_in_group_p(vfsgid))
			mode >>= 3;	/* group permission bits */
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
}
441
/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
		       int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(idmap, inode, mask);
	/* only -EACCES may still be overridden by capabilities below */
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
			if (capable_wrt_inode_uidgid(idmap, inode,
						     CAP_DAC_READ_SEARCH))
				return 0;
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;
		return -EACCES;
	}

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_READ_SEARCH))
			return 0;
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;

	return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
509
/**
 * do_inode_permission - UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
				      struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(idmap, inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(idmap, inode, mask);
}
535
536/**
537 * sb_permission - Check superblock-level permissions
538 * @sb: Superblock of inode to check permission on
539 * @inode: Inode to check permission on
540 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
541 *
542 * Separate out file-system wide checks from inode-specific permission checks.
543 *
544 * Note: lookup_inode_permission_may_exec() does not call here. If you add
545 * MAY_EXEC checks, adjust it.
546 */
547static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
548{
549 if (mask & MAY_WRITE) {
550 umode_t mode = inode->i_mode;
551
552 /* Nobody gets write access to a read-only fs. */
553 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
554 return -EROFS;
555 }
556 return 0;
557}
558
/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	Inode to check permission on
 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * Returns 0 if access is granted, a negative errno otherwise.
 */
int inode_permission(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
	int retval;

	/* filesystem-wide checks (e.g. read-only superblock) go first */
	retval = sb_permission(inode->i_sb, inode, mask);
	if (unlikely(retval))
		return retval;

	if (mask & MAY_WRITE) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (unlikely(IS_IMMUTABLE(inode)))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
			return -EACCES;
	}

	retval = do_inode_permission(idmap, inode, mask);
	if (unlikely(retval))
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (unlikely(retval))
		return retval;

	/* finally, give the LSMs a chance to veto */
	return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
607
/*
 * lookup_inode_permission_may_exec - Check traversal right for given inode
 *
 * This is a special case routine for may_lookup() making assumptions specific
 * to path traversal. Use inode_permission() if you are doing something else.
 *
 * Work is shaved off compared to inode_permission() as follows:
 * - we know for a fact there is no MAY_WRITE to worry about
 * - it is an invariant the inode is a directory
 *
 * Since majority of real-world traversal happens on inodes which grant it for
 * everyone, we check it upfront and only resort to more expensive work if it
 * fails.
 *
 * Filesystems which have their own ->permission hook and consequently miss out
 * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
 * on their directory inodes.
 */
static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
							    struct inode *inode, int mask)
{
	/* Lookup already checked this to return -ENOTDIR */
	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
	/* callers may only pass MAY_NOT_BLOCK; MAY_EXEC is added here */
	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);

	mask |= MAY_EXEC;

	/* no fast-path flag set: take the full permission path */
	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
		return inode_permission(idmap, inode, mask);

	/* fast path needs x granted to everyone and no cached ACLs */
	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
		return inode_permission(idmap, inode, mask);

	return security_inode_permission(inode, mask);
}
643
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
	/* take the mount ref first, mirroring the reverse order in path_put() */
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
656
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
	/* drop the dentry before the mount reference that keeps its fs pinned */
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
669
#define EMBEDDED_LEVELS 2
/*
 * State of a single pathname walk.  Threaded through current->nameidata
 * so that nested walks can find the enclosing one (see __set_nameidata()
 * and restore_nameidata()).
 */
struct nameidata {
	struct path path;		/* current position in the walk */
	struct qstr last;		/* last component name */
	struct path root;		/* root for this walk (may be unset) */
	struct inode *inode; /* path.dentry.d_inode */
	unsigned int flags, state;	/* LOOKUP_* flags / ND_* state bits */
	unsigned seq, next_seq, m_seq, r_seq;	/* rcu-walk seqcount samples */
	int last_type;			/* type of the last component */
	unsigned depth;			/* symlinks currently held on stack[] */
	int total_link_count;		/* symlink budget, shared across nesting */
	struct saved {
		struct path link;
		struct delayed_call done;
		const char *name;
		unsigned seq;
	} *stack, internal[EMBEDDED_LEVELS];	/* in-progress symlinks */
	struct filename *name;
	const char *pathname;		/* name->name, or "" when name is NULL */
	struct nameidata *saved;	/* enclosing walk, if nested */
	unsigned root_seq;		/* d_seq sample of root (rcu-walk) */
	int dfd;			/* starting dirfd for relative paths */
	vfsuid_t dir_vfsuid;
	umode_t dir_mode;
} __randomize_layout;
695
/* nd->state bits */
#define ND_ROOT_PRESET 1	/* nd->root was supplied by the caller */
#define ND_ROOT_GRABBED 2	/* we hold refs on nd->root; terminate_walk() puts them */
#define ND_JUMPED 4		/* walk jumped; complete_walk() may weak-revalidate */
699
/*
 * Install @p as the active pathwalk state for this task, chaining to any
 * walk already in progress.  Undone by restore_nameidata().
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;	/* start with the embedded symlink stack */
	p->depth = 0;
	p->dfd = dfd;
	p->name = name;
	p->pathname = likely(name) ? name->name : "";
	p->path.mnt = NULL;
	p->path.dentry = NULL;
	/* the symlink budget is shared with the enclosing walk, if any */
	p->total_link_count = old ? old->total_link_count : 0;
	p->saved = old;
	current->nameidata = p;
}
714
715static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
716 const struct path *root)
717{
718 __set_nameidata(p, dfd, name);
719 p->state = 0;
720 if (unlikely(root)) {
721 p->state = ND_ROOT_PRESET;
722 p->root = *root;
723 }
724}
725
/*
 * Pop the current pathwalk state: reinstate the enclosing walk (if any),
 * propagate the consumed symlink budget back to it, and free a
 * separately allocated symlink stack (see nd_alloc_stack()).
 */
static void restore_nameidata(void)
{
	struct nameidata *now = current->nameidata, *old = now->saved;

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
	if (now->stack != now->internal)
		kfree(now->stack);
}
736
737static bool nd_alloc_stack(struct nameidata *nd)
738{
739 struct saved *p;
740
741 p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
742 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
743 if (unlikely(!p))
744 return false;
745 memcpy(p, nd->internal, sizeof(nd->internal));
746 nd->stack = p;
747 return true;
748}
749
750/**
751 * path_connected - Verify that a dentry is below mnt.mnt_root
752 * @mnt: The mountpoint to check.
753 * @dentry: The dentry to check.
754 *
755 * Rename can sometimes move a file or directory outside of a bind
756 * mount, path_connected allows those cases to be detected.
757 */
758static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
759{
760 struct super_block *sb = mnt->mnt_sb;
761
762 /* Bind mounts can have disconnected paths */
763 if (mnt->mnt_root == sb->s_root)
764 return true;
765
766 return is_subdir(dentry, mnt->mnt_root);
767}
768
769static void drop_links(struct nameidata *nd)
770{
771 int i = nd->depth;
772 while (i--) {
773 struct saved *last = nd->stack + i;
774 do_delayed_call(&last->done);
775 clear_delayed_call(&last->done);
776 }
777}
778
/*
 * Leave rcu-walk mode: clear LOOKUP_RCU and the now-meaningless
 * sequence numbers, then drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}
785
/*
 * Tear down all walk state: run delayed symlink cleanups, then either
 * drop the references we hold (ref-walk) or simply leave RCU (rcu-walk,
 * where no references are held).
 */
static void terminate_walk(struct nameidata *nd)
{
	if (unlikely(nd->depth))
		drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		/* only put the root if this walk actually grabbed it */
		if (nd->state & ND_ROOT_GRABBED) {
			path_put(&nd->root);
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
}
806
/* path_put is needed afterwards regardless of success or failure */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
	int res = __legitimize_mnt(path->mnt, mseq);
	if (unlikely(res)) {
		/* res > 0: no mnt ref was taken; NULL it so path_put skips it */
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	/*
	 * Both refs are held now; the seqcount decides whether they name
	 * the same objects we saw under RCU.
	 */
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
823
/* Legitimize @path against the mount seqcount sampled at walk start. */
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	return __legitimize_path(path, seq, nd->m_seq);
}
829
/*
 * Acquire references on every symlink held on the stack.  On failure,
 * delayed cleanups run immediately via drop_links() and nd->depth is
 * trimmed to cover only the entries terminate_walk() still has to put.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	/* LOOKUP_CACHED walks must never leave RCU mode this way */
	VFS_BUG_ON(nd->flags & LOOKUP_CACHED);

	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}
846
static bool legitimize_root(struct nameidata *nd)
{
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
		return true;
	/*
	 * Flag the root as grabbed *before* trying: even a failed
	 * legitimization can leave references behind that
	 * terminate_walk() must drop (see __legitimize_path()).
	 */
	nd->state |= ND_ROOT_GRABBED;
	return legitimize_path(nd, &nd->root, nd->root_seq);
}
855
856/*
857 * Path walking has 2 modes, rcu-walk and ref-walk (see
858 * Documentation/filesystems/path-lookup.txt). In situations when we can't
859 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
860 * normal reference counts on dentries and vfsmounts to transition to ref-walk
861 * mode. Refcounts are grabbed at the last known good point before rcu-walk
862 * got stuck, so ref-walk may continue from there. If this is not successful
863 * (eg. a seqcount has changed), then failure is returned and it's up to caller
864 * to restart the path walk from the beginning in ref-walk mode.
865 */
866
/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* LOOKUP_CACHED means "RCU only": fail rather than take refs */
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		goto out1;
	}
	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

out1:
	/* no refs taken on nd->path; NULL it so terminate_walk() skips it */
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	/* partial refs (if any) were handled by the legitimize helpers */
	leave_rcu(nd);
	return false;
}
906
/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* LOOKUP_CACHED means "RCU only": fail rather than take refs */
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		goto out2;
	}
	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out2;
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			goto out2;	/* no mnt ref was taken */
		goto out1;		/* mnt ref held, dentry ref not */
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

	/*
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	leave_rcu(nd);
	return true;

out2:
	/* no mnt ref held either; NULL it so terminate_walk() skips it */
	nd->path.mnt = NULL;
out1:
	/* dentry ref not held; mnt (if non-NULL) is put by terminate_walk() */
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
out_dput:
	/* we do hold a ref on @dentry here; drop it ourselves */
	leave_rcu(nd);
	dput(dentry);
	return false;
}
972
973static inline int d_revalidate(struct inode *dir, const struct qstr *name,
974 struct dentry *dentry, unsigned int flags)
975{
976 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
977 return dentry->d_op->d_revalidate(dir, name, dentry, flags);
978 else
979 return 1;
980}
981
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (likely(!(nd->state & ND_ROOT_PRESET)))
			if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
				nd->root.mnt = NULL;
		/* the walk itself is done, so unlazy even for LOOKUP_CACHED */
		nd->flags &= ~LOOKUP_CACHED;
		if (!try_to_unlazy(nd))
			return -ECHILD;
	}

	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

	/* only a walk that jumped (ND_JUMPED) may need weak revalidation */
	if (likely(!(nd->state & ND_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
		return 0;

	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
	if (status > 0)
		return 0;

	/* 0 from ->d_weak_revalidate() means "invalid"; map it to -ESTALE */
	if (!status)
		status = -ESTALE;

	return status;
}
1046
/*
 * Sample current->fs->root into nd->root.  In RCU mode only the root and a
 * d_seq sample are recorded (no references taken); the fs->seq retry loop
 * guards against a concurrent root change.  In ref-walk mode a proper
 * reference is taken and ND_ROOT_GRABBED marks it for later release.
 */
static int set_root(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqbegin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqretry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
		nd->state |= ND_ROOT_GRABBED;
	}
	return 0;
}
1073
/*
 * Restart the walk from the root, as happens for an absolute symlink or a
 * leading '/'.  Refused with -EXDEV when the lookup must not escape its
 * starting point (LOOKUP_BENEATH, or LOOKUP_NO_XDEV once mid-walk).
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (unlikely(error))
			return error;
	}
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		/* verify the root didn't change since it was sampled */
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->state |= ND_JUMPED;
	return 0;
}
1105
1106/*
1107 * Helper to directly jump to a known parsed path from ->get_link,
1108 * caller must have taken a reference to path beforehand.
1109 */
1110int nd_jump_link(const struct path *path)
1111{
1112 int error = -ELOOP;
1113 struct nameidata *nd = current->nameidata;
1114
1115 if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1116 goto err;
1117
1118 error = -EXDEV;
1119 if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1120 if (nd->path.mnt != path->mnt)
1121 goto err;
1122 }
1123 /* Not currently safe for scoped-lookups. */
1124 if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1125 goto err;
1126
1127 path_put(&nd->path);
1128 nd->path = *path;
1129 nd->inode = nd->path.dentry->d_inode;
1130 nd->state |= ND_JUMPED;
1131 return 0;
1132
1133err:
1134 path_put(path);
1135 return error;
1136}
1137
/*
 * Pop the topmost symlink off nd->stack, running any delayed cleanup
 * registered by ->get_link().  In RCU mode no reference was taken on
 * the link's path, so there is nothing to put.
 */
static inline void put_link(struct nameidata *nd)
{
	struct saved *last = nd->stack + --nd->depth;
	do_delayed_call(&last->done);
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
}
1145
/*
 * Knobs behind the fs.protected_{symlinks,hardlinks,fifos,regular}
 * sysctls; consumed by may_follow_link(), may_linkat() and
 * may_create_in_sticky() below.
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
static const struct ctl_table namei_sysctls[] = {
	{
		.procname = "protected_symlinks",
		.data = &sysctl_protected_symlinks,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,	/* boolean */
	},
	{
		.procname = "protected_hardlinks",
		.data = &sysctl_protected_hardlinks,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,	/* boolean */
	},
	{
		.procname = "protected_fifos",
		.data = &sysctl_protected_fifos,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_TWO,	/* 2 = also group-writable dirs */
	},
	{
		.procname = "protected_regular",
		.data = &sysctl_protected_regular,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_TWO,	/* 2 = also group-writable dirs */
	},
};

static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */
1199
1200/**
1201 * may_follow_link - Check symlink following for unsafe situations
1202 * @nd: nameidata pathwalk data
1203 * @inode: Used for idmapping.
1204 *
1205 * In the case of the sysctl_protected_symlinks sysctl being enabled,
1206 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1207 * in a sticky world-writable directory. This is to protect privileged
1208 * processes from failing races against path names that may change out
1209 * from under them by way of other users creating malicious symlinks.
1210 * It will permit symlinks to be followed only when outside a sticky
1211 * world-writable directory, or when the uid of the symlink and follower
1212 * match, or when the directory owner matches the symlink's owner.
1213 *
1214 * Returns 0 if following the symlink is allowed, -ve on error.
1215 */
1216static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
1217{
1218 struct mnt_idmap *idmap;
1219 vfsuid_t vfsuid;
1220
1221 if (!sysctl_protected_symlinks)
1222 return 0;
1223
1224 idmap = mnt_idmap(nd->path.mnt);
1225 vfsuid = i_uid_into_vfsuid(idmap, inode);
1226 /* Allowed if owner and follower match. */
1227 if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1228 return 0;
1229
1230 /* Allowed if parent directory not sticky and world-writable. */
1231 if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
1232 return 0;
1233
1234 /* Allowed if parent directory and link owner match. */
1235 if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
1236 return 0;
1237
1238 if (nd->flags & LOOKUP_RCU)
1239 return -ECHILD;
1240
1241 audit_inode(nd->name, nd->stack[0].link.dentry, 0);
1242 audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
1243 return -EACCES;
1244}
1245
1246/**
1247 * safe_hardlink_source - Check for safe hardlink conditions
1248 * @idmap: idmap of the mount the inode was found from
1249 * @inode: the source inode to hardlink from
1250 *
1251 * Return false if at least one of the following conditions:
1252 * - inode is not a regular file
1253 * - inode is setuid
1254 * - inode is setgid and group-exec
1255 * - access failure for read and write
1256 *
1257 * Otherwise returns true.
1258 */
1259static bool safe_hardlink_source(struct mnt_idmap *idmap,
1260 struct inode *inode)
1261{
1262 umode_t mode = inode->i_mode;
1263
1264 /* Special files should not get pinned to the filesystem. */
1265 if (!S_ISREG(mode))
1266 return false;
1267
1268 /* Setuid files should not get pinned to the filesystem. */
1269 if (mode & S_ISUID)
1270 return false;
1271
1272 /* Executable setgid files should not get pinned to the filesystem. */
1273 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1274 return false;
1275
1276 /* Hardlinking to unreadable or unwritable sources is dangerous. */
1277 if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE))
1278 return false;
1279
1280 return true;
1281}
1282
1283/**
1284 * may_linkat - Check permissions for creating a hardlink
1285 * @idmap: idmap of the mount the inode was found from
1286 * @link: the source to hardlink from
1287 *
1288 * Block hardlink when all of:
1289 * - sysctl_protected_hardlinks enabled
1290 * - fsuid does not match inode
1291 * - hardlink source is unsafe (see safe_hardlink_source() above)
1292 * - not CAP_FOWNER in a namespace with the inode owner uid mapped
1293 *
1294 * If the inode has been found through an idmapped mount the idmap of
1295 * the vfsmount must be passed through @idmap. This function will then take
1296 * care to map the inode according to @idmap before checking permissions.
1297 * On non-idmapped mounts or if permission checking is to be performed on the
1298 * raw inode simply pass @nop_mnt_idmap.
1299 *
1300 * Returns 0 if successful, -ve on error.
1301 */
1302int may_linkat(struct mnt_idmap *idmap, const struct path *link)
1303{
1304 struct inode *inode = link->dentry->d_inode;
1305
1306 /* Inode writeback is not safe when the uid or gid are invalid. */
1307 if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
1308 !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
1309 return -EOVERFLOW;
1310
1311 if (!sysctl_protected_hardlinks)
1312 return 0;
1313
1314 /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1315 * otherwise, it must be a safe source.
1316 */
1317 if (safe_hardlink_source(idmap, inode) ||
1318 inode_owner_or_capable(idmap, inode))
1319 return 0;
1320
1321 audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
1322 return -EPERM;
1323}
1324
1325/**
1326 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1327 * should be allowed, or not, on files that already
1328 * exist.
1329 * @idmap: idmap of the mount the inode was found from
1330 * @nd: nameidata pathwalk data
1331 * @inode: the inode of the file to open
1332 *
1333 * Block an O_CREAT open of a FIFO (or a regular file) when:
1334 * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1335 * - the file already exists
1336 * - we are in a sticky directory
1337 * - we don't own the file
1338 * - the owner of the directory doesn't own the file
1339 * - the directory is world writable
1340 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1341 * the directory doesn't have to be world writable: being group writable will
1342 * be enough.
1343 *
1344 * If the inode has been found through an idmapped mount the idmap of
1345 * the vfsmount must be passed through @idmap. This function will then take
1346 * care to map the inode according to @idmap before checking permissions.
1347 * On non-idmapped mounts or if permission checking is to be performed on the
1348 * raw inode simply pass @nop_mnt_idmap.
1349 *
1350 * Returns 0 if the open is allowed, -ve on error.
1351 */
1352static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
1353 struct inode *const inode)
1354{
1355 umode_t dir_mode = nd->dir_mode;
1356 vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
1357
1358 if (likely(!(dir_mode & S_ISVTX)))
1359 return 0;
1360
1361 if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
1362 return 0;
1363
1364 if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
1365 return 0;
1366
1367 i_vfsuid = i_uid_into_vfsuid(idmap, inode);
1368
1369 if (vfsuid_eq(i_vfsuid, dir_vfsuid))
1370 return 0;
1371
1372 if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
1373 return 0;
1374
1375 if (likely(dir_mode & 0002)) {
1376 audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
1377 return -EACCES;
1378 }
1379
1380 if (dir_mode & 0020) {
1381 if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
1382 audit_log_path_denied(AUDIT_ANOM_CREAT,
1383 "sticky_create_fifo");
1384 return -EACCES;
1385 }
1386
1387 if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
1388 audit_log_path_denied(AUDIT_ANOM_CREAT,
1389 "sticky_create_regular");
1390 return -EACCES;
1391 }
1392 }
1393
1394 return 0;
1395}
1396
1397/*
1398 * follow_up - Find the mountpoint of path's vfsmount
1399 *
1400 * Given a path, find the mountpoint of its source file system.
1401 * Replace @path with the path of the mountpoint in the parent mount.
1402 * Up is towards /.
1403 *
1404 * Return 1 if we went up a level and 0 if we were already at the
1405 * root.
1406 */
1407int follow_up(struct path *path)
1408{
1409 struct mount *mnt = real_mount(path->mnt);
1410 struct mount *parent;
1411 struct dentry *mountpoint;
1412
1413 read_seqlock_excl(&mount_lock);
1414 parent = mnt->mnt_parent;
1415 if (parent == mnt) {
1416 read_sequnlock_excl(&mount_lock);
1417 return 0;
1418 }
1419 mntget(&parent->mnt);
1420 mountpoint = dget(mnt->mnt_mountpoint);
1421 read_sequnlock_excl(&mount_lock);
1422 dput(path->dentry);
1423 path->dentry = mountpoint;
1424 mntput(path->mnt);
1425 path->mnt = &parent->mnt;
1426 return 1;
1427}
1428EXPORT_SYMBOL(follow_up);
1429
/*
 * Lockless climb from mount @m towards @root, skipping parents where the
 * mountpoint is the parent's own root (i.e. mounts stacked over an entire
 * parent), until a real mountpoint in a parent is found.  Caller is
 * responsible for validating against mount_lock; *@seqp receives a d_seq
 * sample for the mountpoint dentry.  Returns false if @root (or the
 * absolute root) was reached first.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}
1449
/*
 * Reference-grabbing wrapper around choose_mountpoint_rcu(): retry the
 * lockless walk until either a stable "not found" answer is observed or
 * the found path is successfully legitimized (references acquired and
 * both seqcounts validated).
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* negative answer only counts if nothing changed */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* legitimize failed - drop refs and retry */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}
1474
1475/*
1476 * Perform an automount
1477 * - return -EISDIR to tell follow_managed() to stop and return the path we
1478 * were called with.
1479 */
1480static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
1481{
1482 struct dentry *dentry = path->dentry;
1483
1484 /* We don't want to mount if someone's just doing a stat -
1485 * unless they're stat'ing a directory and appended a '/' to
1486 * the name.
1487 *
1488 * We do, however, want to mount if someone wants to open or
1489 * create a file of any type under the mountpoint, wants to
1490 * traverse through the mountpoint or wants to open the
1491 * mounted directory. Also, autofs may mark negative dentries
1492 * as being automount points. These will need the attentions
1493 * of the daemon to instantiate them before they can be used.
1494 */
1495 if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1496 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1497 dentry->d_inode)
1498 return -EISDIR;
1499
1500 /* No need to trigger automounts if mountpoint crossing is disabled. */
1501 if (lookup_flags & LOOKUP_NO_XDEV)
1502 return -EXDEV;
1503
1504 if (count && (*count)++ >= MAXSYMLINKS)
1505 return -ELOOP;
1506
1507 return finish_automount(dentry->d_op->d_automount(path), path);
1508}
1509
1510/*
1511 * mount traversal - out-of-line part. One note on ->d_flags accesses -
1512 * dentries are pinned but not locked here, so negative dentry can go
1513 * positive right under us. Use of smp_load_acquire() provides a barrier
1514 * sufficient for ->d_inode and ->d_flags consistency.
1515 */
1516static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
1517 int *count, unsigned lookup_flags)
1518{
1519 struct vfsmount *mnt = path->mnt;
1520 bool need_mntput = false;
1521 int ret = 0;
1522
1523 while (flags & DCACHE_MANAGED_DENTRY) {
1524 /* Allow the filesystem to manage the transit without i_rwsem
1525 * being held. */
1526 if (flags & DCACHE_MANAGE_TRANSIT) {
1527 if (lookup_flags & LOOKUP_NO_XDEV) {
1528 ret = -EXDEV;
1529 break;
1530 }
1531 ret = path->dentry->d_op->d_manage(path, false);
1532 flags = smp_load_acquire(&path->dentry->d_flags);
1533 if (ret < 0)
1534 break;
1535 }
1536
1537 if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1538 struct vfsmount *mounted = lookup_mnt(path);
1539 if (mounted) { // ... in our namespace
1540 dput(path->dentry);
1541 if (need_mntput)
1542 mntput(path->mnt);
1543 path->mnt = mounted;
1544 path->dentry = dget(mounted->mnt_root);
1545 // here we know it's positive
1546 flags = path->dentry->d_flags;
1547 need_mntput = true;
1548 if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
1549 ret = -EXDEV;
1550 break;
1551 }
1552 continue;
1553 }
1554 }
1555
1556 if (!(flags & DCACHE_NEED_AUTOMOUNT))
1557 break;
1558
1559 // uncovered automount point
1560 ret = follow_automount(path, count, lookup_flags);
1561 flags = smp_load_acquire(&path->dentry->d_flags);
1562 if (ret < 0)
1563 break;
1564 }
1565
1566 if (ret == -EISDIR)
1567 ret = 0;
1568 // possible if you race with several mount --move
1569 if (need_mntput && path->mnt == mnt)
1570 mntput(path->mnt);
1571 if (!ret && unlikely(d_flags_negative(flags)))
1572 ret = -ENOENT;
1573 *jumped = need_mntput;
1574 return ret;
1575}
1576
1577static inline int traverse_mounts(struct path *path, bool *jumped,
1578 int *count, unsigned lookup_flags)
1579{
1580 unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1581
1582 /* fastpath */
1583 if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1584 *jumped = false;
1585 if (unlikely(d_flags_negative(flags)))
1586 return -ENOENT;
1587 return 0;
1588 }
1589 return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1590}
1591
1592int follow_down_one(struct path *path)
1593{
1594 struct vfsmount *mounted;
1595
1596 mounted = lookup_mnt(path);
1597 if (mounted) {
1598 dput(path->dentry);
1599 mntput(path->mnt);
1600 path->mnt = mounted;
1601 path->dentry = dget(mounted->mnt_root);
1602 return 1;
1603 }
1604 return 0;
1605}
1606EXPORT_SYMBOL(follow_down_one);
1607
1608/*
1609 * Follow down to the covering mount currently visible to userspace. At each
1610 * point, the filesystem owning that dentry may be queried as to whether the
1611 * caller is permitted to proceed or not.
1612 */
1613int follow_down(struct path *path, unsigned int flags)
1614{
1615 struct vfsmount *mnt = path->mnt;
1616 bool jumped;
1617 int ret = traverse_mounts(path, &jumped, NULL, flags);
1618
1619 if (path->mnt != mnt)
1620 mntput(mnt);
1621 return ret;
1622}
1623EXPORT_SYMBOL(follow_down);
1624
1625/*
1626 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1627 * we meet a managed dentry that would need blocking.
1628 */
1629static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
1630{
1631 struct dentry *dentry = path->dentry;
1632 unsigned int flags = dentry->d_flags;
1633
1634 if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1635 return true;
1636
1637 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1638 return false;
1639
1640 for (;;) {
1641 /*
1642 * Don't forget we might have a non-mountpoint managed dentry
1643 * that wants to block transit.
1644 */
1645 if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1646 int res = dentry->d_op->d_manage(path, true);
1647 if (res)
1648 return res == -EISDIR;
1649 flags = dentry->d_flags;
1650 }
1651
1652 if (flags & DCACHE_MOUNTED) {
1653 struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1654 if (mounted) {
1655 path->mnt = &mounted->mnt;
1656 dentry = path->dentry = mounted->mnt.mnt_root;
1657 nd->state |= ND_JUMPED;
1658 nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1659 flags = dentry->d_flags;
1660 // makes sure that non-RCU pathwalk could reach
1661 // this state.
1662 if (read_seqretry(&mount_lock, nd->m_seq))
1663 return false;
1664 continue;
1665 }
1666 if (read_seqretry(&mount_lock, nd->m_seq))
1667 return false;
1668 }
1669 return !(flags & DCACHE_NEED_AUTOMOUNT);
1670 }
1671}
1672
/*
 * Cross whatever is mounted on @dentry and report the result in *@path.
 * In rcu-walk mode the lockless __follow_mount_rcu() is tried first; if
 * it can't cope we drop to ref-walk (or fail with -ECHILD).  On error,
 * any references acquired for *@path are dropped before returning.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path)
{
	bool jumped;
	int ret;

	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = nd->next_seq;
		if (likely(!d_managed(dentry)))
			return 0;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (unlikely(!try_to_unlazy_next(nd, dentry)))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped)
		nd->state |= ND_JUMPED;
	if (unlikely(ret)) {
		/* drop whatever traverse_mounts() left us holding */
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	}
	return ret;
}
1704
1705/*
1706 * This looks up the name in dcache and possibly revalidates the found dentry.
1707 * NULL is returned if the dentry does not exist in the cache.
1708 */
1709static struct dentry *lookup_dcache(const struct qstr *name,
1710 struct dentry *dir,
1711 unsigned int flags)
1712{
1713 struct dentry *dentry = d_lookup(dir, name);
1714 if (dentry) {
1715 int error = d_revalidate(dir->d_inode, name, dentry, flags);
1716 if (unlikely(error <= 0)) {
1717 if (!error)
1718 d_invalidate(dentry);
1719 dput(dentry);
1720 return ERR_PTR(error);
1721 }
1722 }
1723 return dentry;
1724}
1725
1726/*
1727 * Parent directory has inode locked exclusive. This is one
1728 * and only case when ->lookup() gets called on non in-lookup
1729 * dentries - as the matter of fact, this only gets called
1730 * when directory is guaranteed to have no in-lookup children
1731 * at all.
1732 * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
1733 * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
1734 */
1735struct dentry *lookup_one_qstr_excl(const struct qstr *name,
1736 struct dentry *base, unsigned int flags)
1737{
1738 struct dentry *dentry;
1739 struct dentry *old;
1740 struct inode *dir;
1741
1742 dentry = lookup_dcache(name, base, flags);
1743 if (dentry)
1744 goto found;
1745
1746 /* Don't create child dentry for a dead directory. */
1747 dir = base->d_inode;
1748 if (unlikely(IS_DEADDIR(dir)))
1749 return ERR_PTR(-ENOENT);
1750
1751 dentry = d_alloc(base, name);
1752 if (unlikely(!dentry))
1753 return ERR_PTR(-ENOMEM);
1754
1755 old = dir->i_op->lookup(dir, dentry, flags);
1756 if (unlikely(old)) {
1757 dput(dentry);
1758 dentry = old;
1759 }
1760found:
1761 if (IS_ERR(dentry))
1762 return dentry;
1763 if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
1764 dput(dentry);
1765 return ERR_PTR(-ENOENT);
1766 }
1767 if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
1768 dput(dentry);
1769 return ERR_PTR(-EEXIST);
1770 }
1771 return dentry;
1772}
1773EXPORT_SYMBOL(lookup_one_qstr_excl);
1774
1775/**
1776 * lookup_fast - do fast lockless (but racy) lookup of a dentry
1777 * @nd: current nameidata
1778 *
1779 * Do a fast, but racy lookup in the dcache for the given dentry, and
1780 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
1781 * found. On error, an ERR_PTR will be returned.
1782 *
1783 * If this function returns a valid dentry and the walk is no longer
1784 * lazy, the dentry will carry a reference that must later be put. If
1785 * RCU mode is still in force, then this is not the case and the dentry
1786 * must be legitimized before use. If this returns NULL, then the walk
1787 * will no longer be in RCU mode.
1788 */
1789static struct dentry *lookup_fast(struct nameidata *nd)
1790{
1791 struct dentry *dentry, *parent = nd->path.dentry;
1792 int status = 1;
1793
1794 /*
1795 * Rename seqlock is not required here because in the off chance
1796 * of a false negative due to a concurrent rename, the caller is
1797 * going to fall back to non-racy lookup.
1798 */
1799 if (nd->flags & LOOKUP_RCU) {
1800 dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
1801 if (unlikely(!dentry)) {
1802 if (!try_to_unlazy(nd))
1803 return ERR_PTR(-ECHILD);
1804 return NULL;
1805 }
1806
1807 /*
1808 * This sequence count validates that the parent had no
1809 * changes while we did the lookup of the dentry above.
1810 */
1811 if (read_seqcount_retry(&parent->d_seq, nd->seq))
1812 return ERR_PTR(-ECHILD);
1813
1814 status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
1815 if (likely(status > 0))
1816 return dentry;
1817 if (!try_to_unlazy_next(nd, dentry))
1818 return ERR_PTR(-ECHILD);
1819 if (status == -ECHILD)
1820 /* we'd been told to redo it in non-rcu mode */
1821 status = d_revalidate(nd->inode, &nd->last,
1822 dentry, nd->flags);
1823 } else {
1824 dentry = __d_lookup(parent, &nd->last);
1825 if (unlikely(!dentry))
1826 return NULL;
1827 status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
1828 }
1829 if (unlikely(status <= 0)) {
1830 if (!status)
1831 d_invalidate(dentry);
1832 dput(dentry);
1833 return ERR_PTR(status);
1834 }
1835 return dentry;
1836}
1837
/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	/* get an in-lookup dentry, or an existing one somebody else made */
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* found a ready-made dentry - revalidate it instead */
		int error = d_revalidate(inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				/* invalid: evict it and retry from scratch */
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			/* the filesystem substituted its own dentry */
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}
1875
1876static noinline struct dentry *lookup_slow(const struct qstr *name,
1877 struct dentry *dir,
1878 unsigned int flags)
1879{
1880 struct inode *inode = dir->d_inode;
1881 struct dentry *res;
1882 inode_lock_shared(inode);
1883 res = __lookup_slow(name, dir, flags);
1884 inode_unlock_shared(inode);
1885 return res;
1886}
1887
1888static struct dentry *lookup_slow_killable(const struct qstr *name,
1889 struct dentry *dir,
1890 unsigned int flags)
1891{
1892 struct inode *inode = dir->d_inode;
1893 struct dentry *res;
1894
1895 if (inode_lock_shared_killable(inode))
1896 return ERR_PTR(-EINTR);
1897 res = __lookup_slow(name, dir, flags);
1898 inode_unlock_shared(inode);
1899 return res;
1900}
1901
/*
 * Check exec permission on the directory we are about to walk through.
 * In RCU mode the check is first tried with MAY_NOT_BLOCK; a failure
 * there may only mean "can't decide without blocking", so we drop out
 * of RCU mode and repeat the check before reporting an error.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
			     struct nameidata *restrict nd)
{
	int err, mask;

	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
	if (likely(!err))
		return 0;

	// If we failed, and we weren't in LOOKUP_RCU, it's final
	if (!(nd->flags & LOOKUP_RCU))
		return err;

	// Drop out of RCU mode to make sure it wasn't transient
	if (!try_to_unlazy(nd))
		return -ECHILD;	// redo it all non-lazy

	if (err != -ECHILD)	// hard error
		return err;

	/* retry without MAY_NOT_BLOCK now that blocking is allowed */
	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
}
1925
/*
 * Make sure nd->stack has room for one more symlink, enforcing the
 * MAXSYMLINKS limit.  Growing the stack can block, so in RCU mode the
 * link must be legitimized and the walk unlazied before the allocation
 * is retried.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;

	/* room left in the current stack? */
	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	/* already switched to an external (larger) stack? */
	if (likely(nd->stack != nd->internal))
		return 0;
	if (likely(nd_alloc_stack(nd)))
		return 0;

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy. And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
	}
	return -ENOMEM;
}
1951
/* flags for step_into() and friends; WALK_NOFOLLOW forbids following symlinks */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1953
/*
 * Push a freshly encountered symlink onto nd->stack and return its body
 * to be walked.  Returns the link body, NULL for a "pure jump" (nothing
 * left to walk), or ERR_PTR() on failure.
 */
static noinline const char *pick_link(struct nameidata *nd, struct path *link,
				     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error;

	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink from step_into_slowpath() matches the inode */
		if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		/* the stack entry needs its own mount reference */
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
	}

	error = reserve_stack(nd, link);
	if (unlikely(error)) {
		if (!(nd->flags & LOOKUP_RCU))
			path_put(link);
		return ERR_PTR(error);
	}
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
		return ERR_PTR(-ELOOP);

	if (unlikely(atime_needs_update(&last->link, inode))) {
		/* touch_atime() may block - leave RCU mode first */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		touch_atime(&last->link);
		cond_resched();
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	/* a cached ->i_link lets us skip calling ->get_link() */
	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
			/* -ECHILD from ->get_link(): retry in ref-walk mode */
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		/* absolute symlink: restart from root, skip extra slashes */
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
}
2035
2036/*
2037 * Do we need to follow links? We _really_ want to be able
2038 * to do this check without having to look at inode->i_op,
2039 * so we keep a cache of "no, this doesn't need follow_link"
2040 * for the common case.
2041 *
2042 * NOTE: dentry must be what nd->next_seq had been sampled from.
2043 */
2044static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
2045 struct dentry *dentry)
2046{
2047 struct path path;
2048 struct inode *inode;
2049 int err;
2050
2051 err = handle_mounts(nd, dentry, &path);
2052 if (unlikely(err < 0))
2053 return ERR_PTR(err);
2054 inode = path.dentry->d_inode;
2055 if (likely(!d_is_symlink(path.dentry)) ||
2056 ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
2057 (flags & WALK_NOFOLLOW)) {
2058 /* not a symlink or should not follow */
2059 if (nd->flags & LOOKUP_RCU) {
2060 if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
2061 return ERR_PTR(-ECHILD);
2062 if (unlikely(!inode))
2063 return ERR_PTR(-ENOENT);
2064 } else {
2065 dput(nd->path.dentry);
2066 if (nd->path.mnt != path.mnt)
2067 mntput(nd->path.mnt);
2068 }
2069 nd->path = path;
2070 nd->inode = inode;
2071 nd->seq = nd->next_seq;
2072 return NULL;
2073 }
2074 return pick_link(nd, &path, inode, flags);
2075}
2076
/*
 * Step nameidata onto @dentry.  The common rcu-walk case - an unmanaged,
 * non-symlink, positive dentry - is handled here inline without touching
 * any refcounts; everything else goes through step_into_slowpath().
 */
static __always_inline const char *step_into(struct nameidata *nd, int flags,
				     struct dentry *dentry)
{
	/*
	 * In the common case we are in rcu-walk and traversing over a non-mounted on
	 * directory (as opposed to e.g., a symlink).
	 *
	 * We can handle that and negative entries with the checks below.
	 */
	if (likely((nd->flags & LOOKUP_RCU) &&
		   !d_managed(dentry) && !d_is_symlink(dentry))) {
		struct inode *inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
		if (unlikely(!inode))
			return ERR_PTR(-ENOENT);
		nd->path.dentry = dentry;
		/* nd->path.mnt is retained on purpose */
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	return step_into_slowpath(nd, flags, dentry);
}
2101
/* RCU-mode ".." traversal; any failure returns -ECHILD to force ref-walk. */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;
		unsigned seq;
		/* on a mount root: ".." means going to the mountpoint's parent */
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		/* crossing mounts is forbidden; ref-walk will report -EXDEV */
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	/* ".." from a detached subtree must not escape; retry in ref-walk */
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	/* ".." in the root stays in the root */
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}
2141
/* Ref-walk counterpart of follow_dotdot_rcu(); returns a referenced parent. */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;

		/* on a mount root: step out to the mountpoint's parent mount */
		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
		path_put(&nd->path);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
	}
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	/* ".." must not escape from a disconnected subtree */
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	return parent;

in_root:
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	/* ".." in the root stays in the root */
	return dget(nd->path.dentry);
}
2173
/* Handle "." (a no-op) and ".." components; NULL on success, error otherwise. */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		/* ".." needs a root to compare against; pick it up lazily */
		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (unlikely(error))
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		/* ".." is never followed as a symlink */
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}
2211
/*
 * Walk one component (nd->last): NULL on success, a link body to follow,
 * or an ERR_PTR().
 */
static __always_inline const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (unlikely(nd->depth) && !(flags & WALK_MORE))
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	/* try the dcache first; NULL means we need a real lookup */
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	/* done with the symlink this component came from, unless WALK_MORE */
	if (unlikely(nd->depth) && !(flags & WALK_MORE))
		put_link(nd);
	return step_into(nd, flags, dentry);
}
2237
2238/*
2239 * We can do the critical dentry name comparison and hashing
2240 * operations one word at a time, but we are limited to:
2241 *
2242 * - Architectures with fast unaligned word accesses. We could
2243 * do a "get_unaligned()" if this helps and is sufficiently
2244 * fast.
2245 *
2246 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
2247 * do not trap on the (extremely unlikely) case of a page
2248 * crossing operation.
2249 *
2250 * - Furthermore, we need an efficient 64-bit compile for the
2251 * 64-bit case in order to generate the "number of bytes in
2252 * the final mask". Again, that could be replaced with a
2253 * efficient population count instruction or similar.
2254 */
2255#ifdef CONFIG_DCACHE_WORD_ACCESS
2256
2257#include <asm/word-at-a-time.h>
2258
2259#ifdef HASH_MIX
2260
2261/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2262
2263#elif defined(CONFIG_64BIT)
2264/*
2265 * Register pressure in the mixing function is an issue, particularly
2266 * on 32-bit x86, but almost any function requires one state value and
2267 * one temporary. Instead, use a function designed for two state values
2268 * and no temporaries.
2269 *
2270 * This function cannot create a collision in only two iterations, so
2271 * we have two iterations to achieve avalanche. In those two iterations,
2272 * we have six layers of mixing, which is enough to spread one bit's
2273 * influence out to 2^6 = 64 state bits.
2274 *
2275 * Rotate constants are scored by considering either 64 one-bit input
2276 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
2277 * probability of that delta causing a change to each of the 128 output
2278 * bits, using a sample of random initial states.
2279 *
2280 * The Shannon entropy of the computed probabilities is then summed
2281 * to produce a score. Ideally, any input change has a 50% chance of
2282 * toggling any given output bit.
2283 *
2284 * Mixing scores (in bits) for (12,45):
2285 * Input delta: 1-bit 2-bit
2286 * 1 round: 713.3 42542.6
2287 * 2 rounds: 2753.7 140389.8
2288 * 3 rounds: 5954.1 233458.2
2289 * 4 rounds: 7862.6 256672.2
2290 * Perfect: 8192 258048
2291 * (64*128) (64*63/2 * 128)
2292 */
/* One mixing round: inject word 'a', then cross-mix x and y (scores above). */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2298
2299/*
2300 * Fold two longs into one 32-bit hash value. This must be fast, but
2301 * latency isn't quite as critical, as there is a fair bit of additional
2302 * work done before the hash value is used.
2303 */
2304static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2305{
2306 y ^= x * GOLDEN_RATIO_64;
2307 y *= GOLDEN_RATIO_64;
2308 return y >> 32;
2309}
2310
2311#else /* 32-bit case */
2312
2313/*
2314 * Mixing scores (in bits) for (7,20):
2315 * Input delta: 1-bit 2-bit
2316 * 1 round: 330.3 9201.6
2317 * 2 rounds: 1246.4 25475.4
2318 * 3 rounds: 1907.1 31295.1
2319 * 4 rounds: 2042.3 31718.6
2320 * Perfect: 2048 31744
2321 * (32*64) (32*31/2 * 64)
2322 */
/* One mixing round, 32-bit variant: inject 'a', then cross-mix x and y. */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2328
/* Fold the two 32-bit state words into the final hash value. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
2334
2335#endif
2336
2337/*
 * Return the hash of a string of known length. This is carefully
2339 * designed to match hash_name(), which is the more critical function.
2340 * In particular, we must end by hashing a final word containing 0..7
2341 * payload bytes, to match the way that hash_name() iterates until it
2342 * finds the delimiter after the name.
2343 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		/* may read past the end of the string, but never faults */
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* final partial word: mask off the bytes beyond 'len' */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2363
2364/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/* jump into the middle of the loop: load first, mix on later passes */
	len = 0;
	goto inside;

	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	/* mask the final word down to the bytes before the NUL */
	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
2388
2389/*
2390 * Calculate the length and hash of the path component, and
2391 * return the length as the result.
2392 */
static inline const char *hash_name(struct nameidata *nd,
				    const char *name,
				    unsigned long *lastword)
{
	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
	unsigned long adata, bdata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/*
	 * The first iteration is special, because it can result in
	 * '.' and '..' and has no mixing other than the final fold.
	 */
	a = load_unaligned_zeropad(name);
	b = a ^ REPEAT_BYTE('/');	/* a zero byte in 'b' marks a '/' in 'a' */
	if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
		adata = prep_zero_mask(a, adata, &constants);
		bdata = prep_zero_mask(b, bdata, &constants);
		mask = create_zero_mask(adata | bdata);
		a &= zero_bytemask(mask);
		/* single-word component: caller can test it for "." / ".." */
		*lastword = a;
		len = find_zero(mask);
		nd->last.hash = fold_hash(a, y);
		nd->last.len = len;
		return name + len;
	}

	len = 0;
	x = 0;
	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
	a &= zero_bytemask(mask);
	x ^= a;
	len += find_zero(mask);
	*lastword = 0;		// Multi-word components cannot be DOT or DOTDOT

	nd->last.hash = fold_hash(x, y);
	nd->last.len = len;
	return name + len;
}
2440
2441/*
2442 * Note that the 'last' word is always zero-masked, but
2443 * was loaded as a possibly big-endian word.
2444 */
#ifdef __BIG_ENDIAN
  /* component bytes sit in the most significant end of a big-endian load */
  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
#endif
2449
2450#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2451
2452/* Return the hash of a string of known length */
/* Return the hash of a string of known length, one byte at a time. */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2461
2462/* Return the "hash_len" (hash and length) of a null-terminated string */
2463u64 hashlen_string(const void *salt, const char *name)
2464{
2465 unsigned long hash = init_name_hash(salt);
2466 unsigned long len = 0, c;
2467
2468 c = (unsigned char)*name;
2469 while (c) {
2470 len++;
2471 hash = partial_name_hash(c, hash);
2472 c = (unsigned char)name[len];
2473 }
2474 return hashlen_create(end_name_hash(hash), len);
2475}
2476EXPORT_SYMBOL(hashlen_string);
2477
2478/*
2479 * We know there's a real path component here of at least
2480 * one character.
2481 */
static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
	unsigned long hash = init_name_hash(nd->path.dentry);
	unsigned long len = 0, c, last = 0;

	c = (unsigned char)*name;
	do {
		/* pack the raw component bytes for the "." / ".." check below */
		last = (last << 8) + c;
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');

	// This is reliable for DOT or DOTDOT, since the component
	// cannot contain NUL characters - top bits being zero means
	// we cannot have had any other pathnames.
	*lastword = last;
	nd->last.hash = end_name_hash(hash);
	nd->last.len = len;
	return name + len;
}
2503
2504#endif
2505
#ifndef LAST_WORD_IS_DOT
  /* values matching the byte-at-a-time (and little-endian word) hash_name() */
  #define LAST_WORD_IS_DOT	0x2e
  #define LAST_WORD_IS_DOTDOT	0x2e2e
#endif
2510
2511/*
2512 * Name resolution.
2513 * This is the basic name resolution function, turning a pathname into
2514 * the final dentry. We expect 'base' to be positive and a directory.
2515 *
2516 * Returns 0 and nd will have valid dentry and mnt on success.
2517 * Returns error and drops reference to input namei data on failure.
2518 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	int depth = 0; // depth <= nd->depth
	int err;

	nd->last_type = LAST_ROOT;
	nd->flags |= LOOKUP_PARENT;
	if (IS_ERR(name))
		return PTR_ERR(name);
	/* skip leading slashes; path_init() already handled the jump to root */
	if (*name == '/') {
		do {
			name++;
		} while (unlikely(*name == '/'));
	}
	if (unlikely(!*name)) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
		return 0;
	}

	/* At this point we know we have a real path component. */
	for(;;) {
		struct mnt_idmap *idmap;
		const char *link;
		unsigned long lastword;

		idmap = mnt_idmap(nd->path.mnt);
		err = may_lookup(idmap, nd);
		if (unlikely(err))
			return err;

		nd->last.name = name;
		name = hash_name(nd, name, &lastword);

		/* lastword identifies "." and ".." exactly; see hash_name() */
		switch(lastword) {
		case LAST_WORD_IS_DOTDOT:
			nd->last_type = LAST_DOTDOT;
			nd->state |= ND_JUMPED;
			break;

		case LAST_WORD_IS_DOT:
			nd->last_type = LAST_DOT;
			break;

		default:
			nd->last_type = LAST_NORM;
			nd->state &= ~ND_JUMPED;

			/* let the filesystem override the hash if it wants to */
			struct dentry *parent = nd->path.dentry;
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, &nd->last);
				if (err < 0)
					return err;
			}
		}

		if (!*name)
			goto OK;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			name++;
		} while (unlikely(*name == '/'));
		if (unlikely(!*name)) {
OK:
			/* pathname or trailing symlink, done */
			if (likely(!depth)) {
				nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
				nd->dir_mode = nd->inode->i_mode;
				nd->flags &= ~LOOKUP_PARENT;
				return 0;
			}
			/* last component of nested symlink */
			name = nd->stack[--depth].name;
			link = walk_component(nd, 0);
		} else {
			/* not the last component */
			link = walk_component(nd, WALK_MORE);
		}
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
			nd->stack[depth++].name = name;
			name = link;
			continue;
		}
		/* more components follow, so we must be able to look them up */
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (!try_to_unlazy(nd))
					return -ECHILD;
			}
			return -ENOTDIR;
		}
	}
}
2616
2617/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
	int error;
	const char *s = nd->pathname;

	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED))
		return ERR_PTR(-EAGAIN);

	/* empty pathname: no walk will happen, so RCU mode buys nothing */
	if (unlikely(!*s))
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;

	/* sample mount and rename seqcounts for later consistency checks */
	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

	if (unlikely(nd->state & ND_ROOT_PRESET)) {
		/* the caller supplied nd->root; start the walk there */
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->path);
		}
		return s;
	}

	nd->root.mnt = NULL;

	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			/* snapshot pwd consistently without taking references */
			do {
				seq = read_seqbegin(&fs->seq);
				nd->path = fs->pwd;
				nd->inode = nd->path.dentry->d_inode;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqretry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		CLASS(fd_raw, f)(nd->dfd);
		struct dentry *dentry;

		if (fd_empty(f))
			return ERR_PTR(-EBADF);

		if (flags & LOOKUP_LINKAT_EMPTY) {
			if (fd_file(f)->f_cred != current_cred() &&
			    !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
				return ERR_PTR(-ENOENT);
		}

		dentry = fd_file(f)->f_path.dentry;

		/* only an empty ("") path may start at a non-directory */
		if (*s && unlikely(!d_can_lookup(dentry)))
			return ERR_PTR(-ENOTDIR);

		nd->path = fd_file(f)->f_path;
		if (flags & LOOKUP_RCU) {
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	}

	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (unlikely(flags & LOOKUP_IS_SCOPED)) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->state |= ND_ROOT_GRABBED;
		}
	}
	return s;
}
2724
static inline const char *lookup_last(struct nameidata *nd)
{
	/* a trailing '/' forces following symlinks and requires a directory */
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	return walk_component(nd, WALK_TRAILING);
}
2732
/* Step onto whatever is mounted on the current position. */
static int handle_lookup_down(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU))
		dget(nd->path.dentry);
	nd->next_seq = nd->seq;
	/* step_into() returns NULL on success, which PTR_ERR() maps to 0 */
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}
2740
2741/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	/* LOOKUP_DOWN: descend through mounts on the starting point first */
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
		err = handle_lookup_down(nd);
		if (unlikely(err < 0))
			s = ERR_PTR(err);
	}

	/* keep walking until no trailing symlinks remain */
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
	}
	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
			err = -ENOTDIR;
	if (!err) {
		/* transfer the references to the caller */
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2774
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, const struct path *root)
{
	int retval;
	struct nameidata nd;
	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	/* RCU first; fall back to ref-walk, then to forced revalidation */
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(&nd, flags, path);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

	if (likely(!retval))
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
	restore_nameidata();
	return retval;
}
2795
2796/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
			 struct path *parent)
{
	const char *s = path_init(nd, flags);
	int err = link_path_walk(s, nd);
	if (!err)
		err = complete_walk(nd);
	if (!err) {
		/* hand the references over to the caller */
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2812
2813/* Note: this does not consume "name" */
/* Note: this does not consume "name" */
static int __filename_parentat(int dfd, struct filename *name,
			       unsigned int flags, struct path *parent,
			       struct qstr *last, int *type,
			       const struct path *root)
{
	int retval;
	struct nameidata nd;

	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	/* RCU first; fall back to ref-walk, then to forced revalidation */
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
	if (unlikely(retval == -ECHILD))
		retval = path_parentat(&nd, flags, parent);
	if (unlikely(retval == -ESTALE))
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
	if (likely(!retval)) {
		/* report the final component and its classification */
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
	}
	restore_nameidata();
	return retval;
}
2838
/* as __filename_parentat(), with no caller-supplied root */
static int filename_parentat(int dfd, struct filename *name,
			     unsigned int flags, struct path *parent,
			     struct qstr *last, int *type)
{
	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
2845
2846/**
2847 * __start_dirop - begin a create or remove dirop, performing locking and lookup
2848 * @parent: the dentry of the parent in which the operation will occur
2849 * @name: a qstr holding the name within that parent
2850 * @lookup_flags: intent and other lookup flags.
2851 * @state: task state bitmask
2852 *
2853 * The lookup is performed and necessary locks are taken so that, on success,
2854 * the returned dentry can be operated on safely.
2855 * The qstr must already have the hash value calculated.
2856 *
2857 * Returns: a locked dentry, or an error.
2858 *
2859 */
static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
				    unsigned int lookup_flags,
				    unsigned int state)
{
	struct dentry *dentry;
	struct inode *dir = d_inode(parent);

	/* TASK_KILLABLE callers may be interrupted while waiting for the lock */
	if (state == TASK_KILLABLE) {
		int ret = down_write_killable_nested(&dir->i_rwsem,
						     I_MUTEX_PARENT);
		if (ret)
			return ERR_PTR(ret);
	} else {
		inode_lock_nested(dir, I_MUTEX_PARENT);
	}
	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
	/* on failure the caller never sees the lock, so drop it here */
	if (IS_ERR(dentry))
		inode_unlock(dir);
	return dentry;
}
2880
/* as __start_dirop(), but waiting for the lock is not killable */
struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
			   unsigned int lookup_flags)
{
	return __start_dirop(parent, name, lookup_flags, TASK_NORMAL);
}
2886
2887/**
2888 * end_dirop - signal completion of a dirop
2889 * @de: the dentry which was returned by start_dirop or similar.
2890 *
 * If @de is an error, nothing happens. Otherwise any lock taken to
 * protect the dentry is dropped and the dentry itself is released (dput()).
2893 */
void end_dirop(struct dentry *de)
{
	if (!IS_ERR(de)) {
		/* drop the parent lock taken by start_dirop(), then the ref */
		inode_unlock(de->d_parent->d_inode);
		dput(de);
	}
}
EXPORT_SYMBOL(end_dirop);
2902
2903/* does lookup, returns the object with parent locked */
2904static struct dentry *__start_removing_path(int dfd, struct filename *name,
2905 struct path *path)
2906{
2907 struct path parent_path __free(path_put) = {};
2908 struct dentry *d;
2909 struct qstr last;
2910 int type, error;
2911
2912 error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
2913 if (error)
2914 return ERR_PTR(error);
2915 if (unlikely(type != LAST_NORM))
2916 return ERR_PTR(-EINVAL);
2917 /* don't fail immediately if it's r/o, at least try to report other errors */
2918 error = mnt_want_write(parent_path.mnt);
2919 d = start_dirop(parent_path.dentry, &last, 0);
2920 if (IS_ERR(d))
2921 goto drop;
2922 if (error)
2923 goto fail;
2924 path->dentry = no_free_ptr(parent_path.dentry);
2925 path->mnt = no_free_ptr(parent_path.mnt);
2926 return d;
2927
2928fail:
2929 end_dirop(d);
2930 d = ERR_PTR(error);
2931drop:
2932 if (!error)
2933 mnt_drop_write(parent_path.mnt);
2934 return d;
2935}
2936
2937/**
2938 * kern_path_parent: lookup path returning parent and target
2939 * @name: path name
 * @path: path to store the parent in
 *
 * The path @name should end with a normal component, not "." or ".." or "/".
 * A lookup is performed and if successful the parent information
 * is stored in @path and the dentry is returned.
 *
 * The dentry may be negative, the parent will be positive.
2947 *
2948 * Returns: dentry or error.
2949 */
struct dentry *kern_path_parent(const char *name, struct path *path)
{
	struct path parent_path __free(path_put) = {};
	struct filename *filename __free(putname) = getname_kernel(name);
	struct dentry *d;
	struct qstr last;
	int type, error;

	error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type);
	if (error)
		return ERR_PTR(error);
	/* the final component must be a normal name, not "." / ".." / "/" */
	if (unlikely(type != LAST_NORM))
		return ERR_PTR(-EINVAL);

	d = lookup_noperm_unlocked(&last, parent_path.dentry);
	if (IS_ERR(d))
		return d;
	/* success: hand the parent references over to the caller */
	path->dentry = no_free_ptr(parent_path.dentry);
	path->mnt = no_free_ptr(parent_path.mnt);
	return d;
}
2971
/* as __start_removing_path() for a kernel-space path relative to cwd */
struct dentry *start_removing_path(const char *name, struct path *path)
{
	struct filename *filename = getname_kernel(name);
	struct dentry *res = __start_removing_path(AT_FDCWD, filename, path);

	putname(filename);
	return res;
}
2980
/* as __start_removing_path() for a user-space path relative to @dfd */
struct dentry *start_removing_user_path_at(int dfd,
					   const char __user *name,
					   struct path *path)
{
	struct filename *filename = getname(name);
	struct dentry *res = __start_removing_path(dfd, filename, path);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(start_removing_user_path_at);
2992
2993int kern_path(const char *name, unsigned int flags, struct path *path)
2994{
2995 struct filename *filename = getname_kernel(name);
2996 int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
2997
2998 putname(filename);
2999 return ret;
3000
3001}
3002EXPORT_SYMBOL(kern_path);
3003
3004/**
3005 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
3006 * @filename: filename structure
3007 * @flags: lookup flags
3008 * @parent: pointer to struct path to fill
3009 * @last: last component
3010 * @type: type of the last component
3011 * @root: pointer to struct path of the base directory
3012 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
			   struct path *parent, struct qstr *last, int *type,
			   const struct path *root)
{
	/* the dfd argument is ignored when a root is supplied */
	return __filename_parentat(AT_FDCWD, filename, flags, parent, last,
				   type, root);
}
EXPORT_SYMBOL(vfs_path_parent_lookup);
3021
3022/**
3023 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
3024 * @dentry: pointer to dentry of the base directory
3025 * @mnt: pointer to vfs mount of the base directory
3026 * @name: pointer to file name
3027 * @flags: lookup flags
3028 * @path: pointer to struct path to fill
3029 */
3030int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
3031 const char *name, unsigned int flags,
3032 struct path *path)
3033{
3034 struct filename *filename;
3035 struct path root = {.mnt = mnt, .dentry = dentry};
3036 int ret;
3037
3038 filename = getname_kernel(name);
3039 /* the first argument of filename_lookup() is ignored with root */
3040 ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
3041 putname(filename);
3042 return ret;
3043}
3044EXPORT_SYMBOL(vfs_path_lookup);
3045
3046int lookup_noperm_common(struct qstr *qname, struct dentry *base)
3047{
3048 const char *name = qname->name;
3049 u32 len = qname->len;
3050
3051 qname->hash = full_name_hash(base, name, len);
3052 if (!len)
3053 return -EACCES;
3054
3055 if (is_dot_dotdot(name, len))
3056 return -EACCES;
3057
3058 while (len--) {
3059 unsigned int c = *(const unsigned char *)name++;
3060 if (c == '/' || c == '\0')
3061 return -EACCES;
3062 }
3063 /*
3064 * See if the low-level filesystem might want
3065 * to use its own hash..
3066 */
3067 if (base->d_flags & DCACHE_OP_HASH) {
3068 int err = base->d_op->d_hash(base, qname);
3069 if (err < 0)
3070 return err;
3071 }
3072 return 0;
3073}
3074
3075static int lookup_one_common(struct mnt_idmap *idmap,
3076 struct qstr *qname, struct dentry *base)
3077{
3078 int err;
3079 err = lookup_noperm_common(qname, base);
3080 if (err < 0)
3081 return err;
3082 return inode_permission(idmap, base->d_inode, MAY_EXEC);
3083}
3084
3085/**
3086 * try_lookup_noperm - filesystem helper to lookup single pathname component
3087 * @name: qstr storing pathname component to lookup
3088 * @base: base directory to lookup from
3089 *
3090 * Look up a dentry by name in the dcache, returning NULL if it does not
3091 * currently exist. The function does not try to create a dentry and if one
3092 * is found it doesn't try to revalidate it.
3093 *
3094 * Note that this routine is purely a helper for filesystem usage and should
3095 * not be called by generic code. It does no permission checking.
3096 *
3097 * No locks need be held - only a counted reference to @base is needed.
3098 *
3099 */
/* See the kerneldoc above: dcache-only lookup, no permission checks. */
struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
	int err = lookup_noperm_common(name, base);

	return err ? ERR_PTR(err) : d_lookup(base, name);
}
EXPORT_SYMBOL(try_lookup_noperm);
3111
3112/**
3113 * lookup_noperm - filesystem helper to lookup single pathname component
3114 * @name: qstr storing pathname component to lookup
3115 * @base: base directory to lookup from
3116 *
3117 * Note that this routine is purely a helper for filesystem usage and should
3118 * not be called by generic code. It does no permission checking.
3119 *
3120 * The caller must hold base->i_rwsem.
3121 */
3122struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
3123{
3124 struct dentry *dentry;
3125 int err;
3126
3127 WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3128
3129 err = lookup_noperm_common(name, base);
3130 if (err)
3131 return ERR_PTR(err);
3132
3133 dentry = lookup_dcache(name, base, 0);
3134 return dentry ? dentry : __lookup_slow(name, base, 0);
3135}
3136EXPORT_SYMBOL(lookup_noperm);
3137
3138/**
3139 * lookup_one - lookup single pathname component
3140 * @idmap: idmap of the mount the lookup is performed from
3141 * @name: qstr holding pathname component to lookup
3142 * @base: base directory to lookup from
3143 *
3144 * This can be used for in-kernel filesystem clients such as file servers.
3145 *
3146 * The caller must hold base->i_rwsem.
3147 */
3148struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
3149 struct dentry *base)
3150{
3151 struct dentry *dentry;
3152 int err;
3153
3154 WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3155
3156 err = lookup_one_common(idmap, name, base);
3157 if (err)
3158 return ERR_PTR(err);
3159
3160 dentry = lookup_dcache(name, base, 0);
3161 return dentry ? dentry : __lookup_slow(name, base, 0);
3162}
3163EXPORT_SYMBOL(lookup_one);
3164
/**
 * lookup_one_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * Unlike lookup_one, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
				   struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
3192
/**
 * lookup_one_positive_killable - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * It should be called without the parent i_rwsem held, and will take
 * the i_rwsem itself if necessary. If a fatal signal is pending or
 * delivered, it will return %-EINTR if the lock is needed.
 */
struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
					    struct qstr *name,
					    struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow_killable(name, base, 0);
	/* d_flags read with acquire semantics pairs with dentry instantiation */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_killable);
3233
/**
 * lookup_one_positive_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * The helper should be called without i_rwsem held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
					    struct qstr *name,
					    struct dentry *base)
{
	struct dentry *ret = lookup_one_unlocked(idmap, name, base);

	/* d_flags read with acquire semantics pairs with dentry instantiation */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);
3264
/**
 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
 * @name: pathname component to lookup
 * @base: base directory to lookup from
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code. It does no permission checking.
 *
 * Unlike lookup_noperm(), it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
 * existed.
 */
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
	struct dentry *dentry;
	int error;

	error = lookup_noperm_common(name, base);
	if (error)
		return ERR_PTR(error);

	dentry = lookup_dcache(name, base, 0);
	return dentry ? dentry : lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_noperm_unlocked);
3294
3295/*
3296 * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
3297 * on negatives. Returns known positive or ERR_PTR(); that's what
3298 * most of the users want. Note that pinned negative with unlocked parent
3299 * _can_ become positive at any time, so callers of lookup_noperm_unlocked()
3300 * need to be very careful; pinned positives have ->d_inode stable, so
3301 * this one avoids such problems.
3302 */
3303struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
3304 struct dentry *base)
3305{
3306 struct dentry *ret;
3307
3308 ret = lookup_noperm_unlocked(name, base);
3309 if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3310 dput(ret);
3311 ret = ERR_PTR(-ENOENT);
3312 }
3313 return ret;
3314}
3315EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
3316
3317/**
3318 * start_creating - prepare to create a given name with permission checking
3319 * @idmap: idmap of the mount
3320 * @parent: directory in which to prepare to create the name
3321 * @name: the name to be created
3322 *
3323 * Locks are taken and a lookup is performed prior to creating
3324 * an object in a directory. Permission checking (MAY_EXEC) is performed
3325 * against @idmap.
3326 *
3327 * If the name already exists, a positive dentry is returned, so
3328 * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
3329 * with -EEXIST.
3330 *
3331 * Returns: a negative or positive dentry, or an error.
3332 */
3333struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
3334 struct qstr *name)
3335{
3336 int err = lookup_one_common(idmap, name, parent);
3337
3338 if (err)
3339 return ERR_PTR(err);
3340 return start_dirop(parent, name, LOOKUP_CREATE);
3341}
3342EXPORT_SYMBOL(start_creating);
3343
/**
 * start_removing - prepare to remove a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
			      struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing);
3370
/**
 * start_creating_killable - prepare to create a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to prepare to create the name
 * @name: the name to be created
 *
 * Locks are taken and a lookup is performed prior to creating
 * an object in a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name already exists, a positive dentry is returned.
 *
 * If a fatal signal is received or was already pending, the function aborts
 * with -EINTR.
 *
 * Returns: a negative or positive dentry, or an error.
 */
struct dentry *start_creating_killable(struct mnt_idmap *idmap,
				       struct dentry *parent,
				       struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
}
EXPORT_SYMBOL(start_creating_killable);
3399
/**
 * start_removing_killable - prepare to remove a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * If a fatal signal is received or was already pending, the function aborts
 * with -EINTR.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing_killable(struct mnt_idmap *idmap,
				       struct dentry *parent,
				       struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return __start_dirop(parent, name, 0, TASK_KILLABLE);
}
EXPORT_SYMBOL(start_removing_killable);
3430
/**
 * start_creating_noperm - prepare to create a given name without permission checking
 * @parent: directory in which to prepare to create the name
 * @name: the name to be created
 *
 * Locks are taken and a lookup is performed prior to creating
 * an object in a directory.
 *
 * If the name already exists, a positive dentry is returned.
 *
 * Returns: a negative or positive dentry, or an error.
 */
struct dentry *start_creating_noperm(struct dentry *parent,
				     struct qstr *name)
{
	int err = lookup_noperm_common(name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, LOOKUP_CREATE);
}
EXPORT_SYMBOL(start_creating_noperm);
3453
/**
 * start_removing_noperm - prepare to remove a given name without permission checking
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing_noperm(struct dentry *parent,
				     struct qstr *name)
{
	int err = lookup_noperm_common(name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing_noperm);
3478
/**
 * start_creating_dentry - prepare to create a given dentry
 * @parent: directory in which the dentry should be created
 * @child: the dentry to be created
 *
 * A lock is taken to protect the dentry against other dirops and
 * the validity of the dentry is checked: correct parent and still hashed.
 *
 * If the dentry is valid and negative a reference is taken and
 * returned. If not an error is returned.
 *
 * end_creating() should be called when creation is complete, or aborted.
 *
 * Returns: the valid dentry, or an error.
 */
struct dentry *start_creating_dentry(struct dentry *parent,
				     struct dentry *child)
{
	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
	if (unlikely(IS_DEADDIR(parent->d_inode) ||
		     child->d_parent != parent ||
		     d_unhashed(child))) {
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EINVAL);
	}
	if (d_is_positive(child)) {
		/* already exists - creation cannot proceed */
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EEXIST);
	}
	return dget(child);
}
EXPORT_SYMBOL(start_creating_dentry);
3511
/**
 * start_removing_dentry - prepare to remove a given dentry
 * @parent: directory from which dentry should be removed
 * @child: the dentry to be removed
 *
 * A lock is taken to protect the dentry against other dirops and
 * the validity of the dentry is checked: correct parent and still hashed.
 *
 * If the dentry is valid and positive, a reference is taken and
 * returned. If not an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: the valid dentry, or an error.
 */
struct dentry *start_removing_dentry(struct dentry *parent,
				     struct dentry *child)
{
	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
	if (unlikely(IS_DEADDIR(parent->d_inode) ||
		     child->d_parent != parent ||
		     d_unhashed(child))) {
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EINVAL);
	}
	if (d_is_negative(child)) {
		/* nothing to remove */
		inode_unlock(parent->d_inode);
		return ERR_PTR(-ENOENT);
	}
	return dget(child);
}
EXPORT_SYMBOL(start_removing_dentry);
3544
3545#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/*
	 * Transfer the reference from dget_parent() into the path; on the
	 * failure return below the caller's path_put() will release it.
	 */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* child carries its own reference; drop the one now displaced */
	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
3570#endif
3571
3572int user_path_at(int dfd, const char __user *name, unsigned flags,
3573 struct path *path)
3574{
3575 struct filename *filename = getname_flags(name, flags);
3576 int ret = filename_lookup(dfd, filename, flags, path, NULL);
3577
3578 putname(filename);
3579 return ret;
3580}
3581EXPORT_SYMBOL(user_path_at);
3582
3583int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
3584 struct inode *inode)
3585{
3586 kuid_t fsuid = current_fsuid();
3587
3588 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid))
3589 return 0;
3590 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
3591 return 0;
3592 return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
3593}
3594EXPORT_SYMBOL(__check_sticky);
3595
/*
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 * 1. We can't do it if dir is read-only (done in permission())
 * 2. We should have write and exec permissions on dir
 * 3. We can't remove anything from append-only dir
 * 4. We can't do anything with immutable dir (done in permission())
 * 5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 * 6. If the victim is append-only or immutable we can't do anything with
 *    links pointing to it.
 * 7. If the victim has an unknown uid or gid we can't change the inode.
 * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	BUG_ON(victim->d_parent->d_inode != dir);

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;

	if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}
3658
/* Check whether we can create an object with dentry child in directory
 * dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *child)
{
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		return -EOVERFLOW;

	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
3681
// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p = p1, *q = p2, *r;

	/* walk up from p1 looking for p2 (or the root of the component) */
	while ((r = p->d_parent) != p2 && r != p)
		p = r;
	if (r == p2) {
		// p is a child of p2 and an ancestor of p1 or p1 itself
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
		return p;
	}
	// p is the root of connected component that contains p1
	// p2 does not occur on the path from p to p1
	while ((r = q->d_parent) != p1 && r != p && r != q)
		q = r;
	if (r == p1) {
		// q is a child of p1 and an ancestor of p2 or p2 itself
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return q;
	} else if (likely(r == p)) {
		// both p2 and p1 are descendants of p
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return NULL;
	} else { // no common ancestor at the time we'd been called
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
		return ERR_PTR(-EXDEV);
	}
}
3714
3715/*
3716 * p1 and p2 should be directories on the same fs.
3717 */
3718struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
3719{
3720 if (p1 == p2) {
3721 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
3722 return NULL;
3723 }
3724
3725 mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3726 return lock_two_directories(p1, p2);
3727}
3728EXPORT_SYMBOL(lock_rename);
3729
/*
 * c1 and p2 should be on the same fs.
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
	if (READ_ONCE(c1->d_parent) == p2) {
		/*
		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
		 */
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		/*
		 * now that p2 is locked, nobody can move in or out of it,
		 * so the test below is safe.
		 */
		if (likely(c1->d_parent == p2))
			return NULL;

		/*
		 * c1 got moved out of p2 while we'd been taking locks;
		 * unlock and fall back to slow case.
		 */
		inode_unlock(p2->d_inode);
	}

	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
	/*
	 * nobody can move out of any directories on this fs.
	 */
	if (likely(c1->d_parent != p2))
		return lock_two_directories(c1->d_parent, p2);

	/*
	 * c1 got moved into p2 while we were taking locks;
	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
	 * for consistency with lock_rename().
	 */
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
	return NULL;
}
EXPORT_SYMBOL(lock_rename_child);
3771
3772void unlock_rename(struct dentry *p1, struct dentry *p2)
3773{
3774 inode_unlock(p1->d_inode);
3775 if (p1 != p2) {
3776 inode_unlock(p2->d_inode);
3777 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
3778 }
3779}
3780EXPORT_SYMBOL(unlock_rename);
3781
/**
 * __start_renaming - lookup and lock names for rename
 * @rd: rename data containing parents and flags, and
 *      for receiving found dentries
 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
 *      LOOKUP_NO_SYMLINKS etc).
 * @old_last: name of object in @rd.old_parent
 * @new_last: name of object in @rd.new_parent
 *
 * Look up two names and ensure locks are in place for
 * rename.
 *
 * On success the found dentries are stored in @rd.old_dentry,
 * @rd.new_dentry and an extra ref is taken on @rd.old_parent.
 * These references and the lock are dropped by end_renaming().
 *
 * The passed in qstrs must have the hash calculated, and no permission
 * checking is performed.
 *
 * Returns: zero or an error.
 */
static int
__start_renaming(struct renamedata *rd, int lookup_flags,
		 struct qstr *old_last, struct qstr *new_last)
{
	struct dentry *trap;
	struct dentry *d1, *d2;
	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
	int err;

	/* exchange may target an existing name; NOREPLACE must not */
	if (rd->flags & RENAME_EXCHANGE)
		target_flags = 0;
	if (rd->flags & RENAME_NOREPLACE)
		target_flags |= LOOKUP_EXCL;

	trap = lock_rename(rd->old_parent, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);

	d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
				  lookup_flags);
	err = PTR_ERR(d1);
	if (IS_ERR(d1))
		goto out_unlock;

	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
				  lookup_flags | target_flags);
	err = PTR_ERR(d2);
	if (IS_ERR(d2))
		goto out_dput_d1;

	if (d1 == trap) {
		/* source is an ancestor of target */
		err = -EINVAL;
		goto out_dput_d2;
	}

	if (d2 == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_dput_d2;
	}

	rd->old_dentry = d1;
	rd->new_dentry = d2;
	dget(rd->old_parent);
	return 0;

out_dput_d2:
	dput(d2);
out_dput_d1:
	dput(d1);
out_unlock:
	unlock_rename(rd->old_parent, rd->new_parent);
	return err;
}
3861
3862/**
3863 * start_renaming - lookup and lock names for rename with permission checking
3864 * @rd: rename data containing parents and flags, and
3865 * for receiving found dentries
3866 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3867 * LOOKUP_NO_SYMLINKS etc).
3868 * @old_last: name of object in @rd.old_parent
3869 * @new_last: name of object in @rd.new_parent
3870 *
3871 * Look up two names and ensure locks are in place for
3872 * rename.
3873 *
3874 * On success the found dentries are stored in @rd.old_dentry,
3875 * @rd.new_dentry. Also the refcount on @rd->old_parent is increased.
3876 * These references and the lock are dropped by end_renaming().
3877 *
3878 * The passed in qstrs need not have the hash calculated, and basic
3879 * eXecute permission checking is performed against @rd.mnt_idmap.
3880 *
3881 * Returns: zero or an error.
3882 */
3883int start_renaming(struct renamedata *rd, int lookup_flags,
3884 struct qstr *old_last, struct qstr *new_last)
3885{
3886 int err;
3887
3888 err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent);
3889 if (err)
3890 return err;
3891 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
3892 if (err)
3893 return err;
3894 return __start_renaming(rd, lookup_flags, old_last, new_last);
3895}
3896EXPORT_SYMBOL(start_renaming);
3897
/*
 * Core of start_renaming_dentry(): @old_dentry is already known, so only
 * the target name is looked up. The qstr must already be hashed and no
 * permission checking is performed here.
 */
static int
__start_renaming_dentry(struct renamedata *rd, int lookup_flags,
			struct dentry *old_dentry, struct qstr *new_last)
{
	struct dentry *trap;
	struct dentry *d2;
	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
	int err;

	/* exchange may target an existing name; NOREPLACE must not */
	if (rd->flags & RENAME_EXCHANGE)
		target_flags = 0;
	if (rd->flags & RENAME_NOREPLACE)
		target_flags |= LOOKUP_EXCL;

	/* Already have the dentry - need to be sure to lock the correct parent */
	trap = lock_rename_child(old_dentry, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);
	if (d_unhashed(old_dentry) ||
	    (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
		/* dentry was removed, or moved and explicit parent requested */
		err = -EINVAL;
		goto out_unlock;
	}

	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
				  lookup_flags | target_flags);
	err = PTR_ERR(d2);
	if (IS_ERR(d2))
		goto out_unlock;

	if (old_dentry == trap) {
		/* source is an ancestor of target */
		err = -EINVAL;
		goto out_dput_d2;
	}

	if (d2 == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_dput_d2;
	}

	rd->old_dentry = dget(old_dentry);
	rd->new_dentry = d2;
	rd->old_parent = dget(old_dentry->d_parent);
	return 0;

out_dput_d2:
	dput(d2);
out_unlock:
	unlock_rename(old_dentry->d_parent, rd->new_parent);
	return err;
}
3955
3956/**
3957 * start_renaming_dentry - lookup and lock name for rename with permission checking
3958 * @rd: rename data containing parents and flags, and
3959 * for receiving found dentries
3960 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3961 * LOOKUP_NO_SYMLINKS etc).
3962 * @old_dentry: dentry of name to move
3963 * @new_last: name of target in @rd.new_parent
3964 *
3965 * Look up target name and ensure locks are in place for
3966 * rename.
3967 *
3968 * On success the found dentry is stored in @rd.new_dentry and
3969 * @rd.old_parent is confirmed to be the parent of @old_dentry. If it
3970 * was originally %NULL, it is set. In either case a reference is taken
3971 * so that end_renaming() can have a stable reference to unlock.
3972 *
3973 * References and the lock can be dropped with end_renaming()
3974 *
3975 * The passed in qstr need not have the hash calculated, and basic
3976 * eXecute permission checking is performed against @rd.mnt_idmap.
3977 *
3978 * Returns: zero or an error.
3979 */
3980int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
3981 struct dentry *old_dentry, struct qstr *new_last)
3982{
3983 int err;
3984
3985 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
3986 if (err)
3987 return err;
3988 return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
3989}
3990EXPORT_SYMBOL(start_renaming_dentry);
3991
/**
 * start_renaming_two_dentries - lock two dentries in given parents for rename
 * @rd: rename data containing parents and flags
 * @old_dentry: dentry of name to move
 * @new_dentry: dentry to move to
 *
 * Ensure locks are in place for rename and check parentage is still correct.
 *
 * On success the two dentries are stored in @rd.old_dentry and
 * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to
 * be the parents of the dentries.
 *
 * References and the lock can be dropped with end_renaming()
 *
 * Returns: zero or an error.
 */
int
start_renaming_two_dentries(struct renamedata *rd,
			    struct dentry *old_dentry, struct dentry *new_dentry)
{
	struct dentry *trap;
	int err;

	/* Already have the dentry - need to be sure to lock the correct parent */
	trap = lock_rename_child(old_dentry, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);
	err = -EINVAL;
	if (d_unhashed(old_dentry) ||
	    (rd->old_parent && rd->old_parent != old_dentry->d_parent))
		/* old_dentry was removed, or moved and explicit parent requested */
		goto out_unlock;
	if (d_unhashed(new_dentry) ||
	    rd->new_parent != new_dentry->d_parent)
		/* new_dentry was removed or moved */
		goto out_unlock;

	if (old_dentry == trap)
		/* source is an ancestor of target */
		goto out_unlock;

	if (new_dentry == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_unlock;
	}

	err = -EEXIST;
	if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE))
		goto out_unlock;

	rd->old_dentry = dget(old_dentry);
	rd->new_dentry = dget(new_dentry);
	rd->old_parent = dget(old_dentry->d_parent);
	return 0;

out_unlock:
	unlock_rename(old_dentry->d_parent, rd->new_parent);
	return err;
}
EXPORT_SYMBOL(start_renaming_two_dentries);
4056
/**
 * end_renaming - release locks and references taken by start_renaming()
 * @rd: rename data previously filled in by start_renaming(),
 *      start_renaming_dentry() or start_renaming_two_dentries()
 *
 * Drops the rename locks first, then the dentry and parent references
 * (the references keep the dentries valid while still locked).
 */
void end_renaming(struct renamedata *rd)
{
	unlock_rename(rd->old_parent, rd->new_parent);
	dput(rd->old_dentry);
	dput(rd->new_dentry);
	dput(rd->old_parent);
}
EXPORT_SYMBOL(end_renaming);
4065
4066/**
4067 * vfs_prepare_mode - prepare the mode to be used for a new inode
4068 * @idmap: idmap of the mount the inode was found from
4069 * @dir: parent directory of the new inode
4070 * @mode: mode of the new inode
4071 * @mask_perms: allowed permission by the vfs
4072 * @type: type of file to be created
4073 *
4074 * This helper consolidates and enforces vfs restrictions on the @mode of a new
4075 * object to be created.
4076 *
4077 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
4078 * the kernel documentation for mode_strip_umask()). Moving umask stripping
4079 * after setgid stripping allows the same ordering for both non-POSIX ACL and
4080 * POSIX ACL supporting filesystems.
4081 *
4082 * Note that it's currently valid for @type to be 0 if a directory is created.
4083 * Filesystems raise that flag individually and we need to check whether each
4084 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
4085 * non-zero type.
4086 *
4087 * Returns: mode to be passed to the filesystem
4088 */
4089static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
4090 const struct inode *dir, umode_t mode,
4091 umode_t mask_perms, umode_t type)
4092{
4093 mode = mode_strip_sgid(idmap, dir, mode);
4094 mode = mode_strip_umask(dir, mode);
4095
4096 /*
4097 * Apply the vfs mandated allowed permission mask and set the type of
4098 * file to be created before we call into the filesystem.
4099 */
4100 mode &= (mask_perms & ~S_IFMT);
4101 mode |= (type & S_IFMT);
4102
4103 return mode;
4104}
4105
/**
 * vfs_create - create new file
 * @idmap:	idmap of the mount the inode was found from
 * @dentry:	dentry of the child file
 * @mode:	mode of the child file
 * @di:		returns parent inode, if the inode is delegated.
 *
 * Create a new file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: zero on success, a negative errno otherwise.
 */
int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
	       struct delegated_inode *di)
{
	struct inode *dir = d_inode(dentry->d_parent);
	int error;

	error = may_create(idmap, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */

	/* restrict to regular-file permission bits and force S_IFREG */
	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	/* break any delegation on the parent before modifying it */
	error = try_break_deleg(dir, di);
	if (error)
		return error;
	error = dir->i_op->create(idmap, dir, dentry, mode, true);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
4147
/**
 * vfs_mkobj - create an object via a caller-supplied constructor
 * @dentry: dentry of the child object
 * @mode: permission bits for the child (S_IFREG is forced)
 * @f: callback that actually instantiates the object
 * @arg: opaque argument passed through to @f
 *
 * Like vfs_create(), but the object is instantiated by @f rather than
 * by the parent directory's ->create() method.  Permission and security
 * checks are done against the raw inode (nop_mnt_idmap).
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&nop_mnt_idmap, dir, dentry);
	if (error)
		return error;

	/* only the permission bits survive; the type is always regular */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);
4168
4169bool may_open_dev(const struct path *path)
4170{
4171 return !(path->mnt->mnt_flags & MNT_NODEV) &&
4172 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
4173}
4174
/*
 * Check whether the object at @path may be opened with access mode
 * @acc_mode (MAY_* bits) and open flags @flag: reject opens invalid
 * for the file type, then apply inode permission, append-only and
 * O_NOATIME restrictions.
 */
static int may_open(struct mnt_idmap *idmap, const struct path *path,
		    int acc_mode, int flag)
{
	struct dentry *dentry = path->dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		return -ELOOP;
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		break;
	case S_IFBLK:
	case S_IFCHR:
		if (!may_open_dev(path))
			return -EACCES;
		fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		/* truncation makes no sense here; silently drop O_TRUNC */
		flag &= ~O_TRUNC;
		break;
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
	default:
		VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode);
	}

	error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
	if (error)
		return error;

	/*
	 * An append-only file must be opened in append mode for writing,
	 * and may never be truncated.
	 */
	if (IS_APPEND(inode)) {
		if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
		return -EPERM;

	return 0;
}
4233
/*
 * Truncate the just-opened file to length 0 (the O_TRUNC part of an
 * open).  Write access on the inode is taken around the security
 * check and do_truncate() call.
 */
static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
	const struct path *path = &filp->f_path;
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;

	error = security_file_truncate(filp);
	if (!error) {
		error = do_truncate(idmap, path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
				    filp);
	}
	put_write_access(inode);
	return error;
}
4251
/*
 * Map an O_ACCMODE value of 3 down to 2 for the lookup code; every
 * other flag combination passes through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
4258
/*
 * Permission checks needed before an O_CREAT open may create @dentry in
 * @dir: the path-mknod security hook, fs{u,g}id mappability on the target
 * superblock, write+exec access on the parent, and the inode-create hook.
 */
static int may_o_create(struct mnt_idmap *idmap,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	/* creating would produce ids unrepresentable in this idmapping */
	if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
		return -EOVERFLOW;

	error = inode_permission(idmap, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}
4277
4278/*
4279 * Attempt to atomically look up, create and open a file from a negative
4280 * dentry.
4281 *
4282 * Returns 0 if successful. The file will have been created and attached to
4283 * @file by the filesystem calling finish_open().
4284 *
4285 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
4286 * be set. The caller will need to perform the open themselves. @path will
4287 * have been updated to point to the new dentry. This may be negative.
4288 *
4289 * Returns an error code otherwise.
4290 */
4291static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
4292 struct file *file,
4293 int open_flag, umode_t mode)
4294{
4295 struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
4296 struct inode *dir = nd->path.dentry->d_inode;
4297 int error;
4298
4299 if (nd->flags & LOOKUP_DIRECTORY)
4300 open_flag |= O_DIRECTORY;
4301
4302 file->__f_path.dentry = DENTRY_NOT_SET;
4303 file->__f_path.mnt = nd->path.mnt;
4304 error = dir->i_op->atomic_open(dir, dentry, file,
4305 open_to_namei_flags(open_flag), mode);
4306 d_lookup_done(dentry);
4307 if (!error) {
4308 if (file->f_mode & FMODE_OPENED) {
4309 if (unlikely(dentry != file->f_path.dentry)) {
4310 dput(dentry);
4311 dentry = dget(file->f_path.dentry);
4312 }
4313 } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
4314 error = -EIO;
4315 } else {
4316 if (file->f_path.dentry) {
4317 dput(dentry);
4318 dentry = file->f_path.dentry;
4319 }
4320 if (unlikely(d_is_negative(dentry)))
4321 error = -ENOENT;
4322 }
4323 }
4324 if (error) {
4325 dput(dentry);
4326 dentry = ERR_PTR(error);
4327 }
4328 return dentry;
4329}
4330
4331/*
4332 * Look up and maybe create and open the last component.
4333 *
4334 * Must be called with parent locked (exclusive in O_CREAT case).
4335 *
4336 * Returns 0 on success, that is, if
4337 * the file was successfully atomically created (if necessary) and opened, or
4338 * the file was not completely opened at this time, though lookups and
4339 * creations were performed.
4340 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
4341 * In the latter case dentry returned in @path might be negative if O_CREAT
4342 * hadn't been specified.
4343 *
4344 * An error code is returned on failure.
4345 */
4346static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
4347 const struct open_flags *op,
4348 bool got_write, struct delegated_inode *delegated_inode)
4349{
4350 struct mnt_idmap *idmap;
4351 struct dentry *dir = nd->path.dentry;
4352 struct inode *dir_inode = dir->d_inode;
4353 int open_flag = op->open_flag;
4354 struct dentry *dentry;
4355 int error, create_error = 0;
4356 umode_t mode = op->mode;
4357 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
4358
4359 if (unlikely(IS_DEADDIR(dir_inode)))
4360 return ERR_PTR(-ENOENT);
4361
4362 file->f_mode &= ~FMODE_CREATED;
4363 dentry = d_lookup(dir, &nd->last);
4364 for (;;) {
4365 if (!dentry) {
4366 dentry = d_alloc_parallel(dir, &nd->last, &wq);
4367 if (IS_ERR(dentry))
4368 return dentry;
4369 }
4370 if (d_in_lookup(dentry))
4371 break;
4372
4373 error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
4374 if (likely(error > 0))
4375 break;
4376 if (error)
4377 goto out_dput;
4378 d_invalidate(dentry);
4379 dput(dentry);
4380 dentry = NULL;
4381 }
4382 if (dentry->d_inode) {
4383 /* Cached positive dentry: will open in f_op->open */
4384 return dentry;
4385 }
4386
4387 if (open_flag & O_CREAT)
4388 audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
4389
4390 /*
4391 * Checking write permission is tricky, bacuse we don't know if we are
4392 * going to actually need it: O_CREAT opens should work as long as the
4393 * file exists. But checking existence breaks atomicity. The trick is
4394 * to check access and if not granted clear O_CREAT from the flags.
4395 *
4396 * Another problem is returing the "right" error value (e.g. for an
4397 * O_EXCL open we want to return EEXIST not EROFS).
4398 */
4399 if (unlikely(!got_write))
4400 open_flag &= ~O_TRUNC;
4401 idmap = mnt_idmap(nd->path.mnt);
4402 if (open_flag & O_CREAT) {
4403 if (open_flag & O_EXCL)
4404 open_flag &= ~O_TRUNC;
4405 mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
4406 if (likely(got_write))
4407 create_error = may_o_create(idmap, &nd->path,
4408 dentry, mode);
4409 else
4410 create_error = -EROFS;
4411 }
4412 if (create_error)
4413 open_flag &= ~O_CREAT;
4414 if (dir_inode->i_op->atomic_open) {
4415 dentry = atomic_open(nd, dentry, file, open_flag, mode);
4416 if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
4417 dentry = ERR_PTR(create_error);
4418 return dentry;
4419 }
4420
4421 if (d_in_lookup(dentry)) {
4422 struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
4423 nd->flags);
4424 d_lookup_done(dentry);
4425 if (unlikely(res)) {
4426 if (IS_ERR(res)) {
4427 error = PTR_ERR(res);
4428 goto out_dput;
4429 }
4430 dput(dentry);
4431 dentry = res;
4432 }
4433 }
4434
4435 /* Negative dentry, just create the file */
4436 if (!dentry->d_inode && (open_flag & O_CREAT)) {
4437 /* but break the directory lease first! */
4438 error = try_break_deleg(dir_inode, delegated_inode);
4439 if (error)
4440 goto out_dput;
4441
4442 file->f_mode |= FMODE_CREATED;
4443 audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
4444 if (!dir_inode->i_op->create) {
4445 error = -EACCES;
4446 goto out_dput;
4447 }
4448
4449 error = dir_inode->i_op->create(idmap, dir_inode, dentry,
4450 mode, open_flag & O_EXCL);
4451 if (error)
4452 goto out_dput;
4453 }
4454 if (unlikely(create_error) && !dentry->d_inode) {
4455 error = create_error;
4456 goto out_dput;
4457 }
4458 return dentry;
4459
4460out_dput:
4461 dput(dentry);
4462 return ERR_PTR(error);
4463}
4464
4465static inline bool trailing_slashes(struct nameidata *nd)
4466{
4467 return (bool)nd->last.name[nd->last.len];
4468}
4469
/*
 * Cached (possibly RCU-mode) lookup of the last component for open().
 * Returns the dentry on a usable cache hit, NULL if the slow path must
 * be taken, or an ERR_PTR.  May add LOOKUP_FOLLOW|LOOKUP_DIRECTORY for
 * a trailing '/'.
 */
static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
{
	struct dentry *dentry;

	if (open_flag & O_CREAT) {
		/* a trailing '/' can never name a regular file to create */
		if (trailing_slashes(nd))
			return ERR_PTR(-EISDIR);

		/* Don't bother on an O_EXCL create */
		if (open_flag & O_EXCL)
			return NULL;
	}

	if (trailing_slashes(nd))
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	dentry = lookup_fast(nd);
	if (IS_ERR_OR_NULL(dentry))
		return dentry;

	if (open_flag & O_CREAT) {
		/* Discard negative dentries. Need inode_lock to do the create */
		if (!dentry->d_inode) {
			/* in RCU mode no reference was taken, nothing to drop */
			if (!(nd->flags & LOOKUP_RCU))
				dput(dentry);
			dentry = NULL;
		}
	}
	return dentry;
}
4500
/*
 * Handle the last component of an open(): try the cached fast path
 * (possibly in RCU mode), otherwise fall back to lookup_open() under the
 * parent's inode lock, retrying if a directory delegation had to be
 * broken.  Returns NULL when the walk is finished, a symlink body to
 * continue with, or an ERR_PTR.
 */
static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct delegated_inode delegated_inode = { };
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;

	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	/* We _can_ be in RCU mode here */
	dentry = lookup_fast_for_open(nd, open_flag);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	if (likely(dentry))
		goto finish_lookup;

	if (!(open_flag & O_CREAT)) {
		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
			return ERR_PTR(-ECHILD);
	} else {
		/* O_CREAT needs a real reference walk before taking locks */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
	}
retry:
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	/* exclusive lock for create, shared is enough for plain lookup */
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	dentry = lookup_open(nd, file, op, got_write, &delegated_inode);
	if (!IS_ERR(dentry)) {
		if (file->f_mode & FMODE_CREATED)
			fsnotify_create(dir->d_inode, dentry);
		if (file->f_mode & FMODE_OPENED)
			fsnotify_open(file);
	}
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry)) {
		/* wait out a broken directory delegation and try again */
		if (is_delegated(&delegated_inode)) {
			int error = break_deleg_wait(&delegated_inode);

			if (!error)
				goto retry;
			return ERR_PTR(error);
		}
		return ERR_CAST(dentry);
	}

	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}
4589
4590/*
4591 * Handle the last step of open()
4592 */
4593static int do_open(struct nameidata *nd,
4594 struct file *file, const struct open_flags *op)
4595{
4596 struct mnt_idmap *idmap;
4597 int open_flag = op->open_flag;
4598 bool do_truncate;
4599 int acc_mode;
4600 int error;
4601
4602 if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
4603 error = complete_walk(nd);
4604 if (error)
4605 return error;
4606 }
4607 if (!(file->f_mode & FMODE_CREATED))
4608 audit_inode(nd->name, nd->path.dentry, 0);
4609 idmap = mnt_idmap(nd->path.mnt);
4610 if (open_flag & O_CREAT) {
4611 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
4612 return -EEXIST;
4613 if (d_is_dir(nd->path.dentry))
4614 return -EISDIR;
4615 error = may_create_in_sticky(idmap, nd,
4616 d_backing_inode(nd->path.dentry));
4617 if (unlikely(error))
4618 return error;
4619 }
4620 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
4621 return -ENOTDIR;
4622
4623 do_truncate = false;
4624 acc_mode = op->acc_mode;
4625 if (file->f_mode & FMODE_CREATED) {
4626 /* Don't check for write permission, don't truncate */
4627 open_flag &= ~O_TRUNC;
4628 acc_mode = 0;
4629 } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
4630 error = mnt_want_write(nd->path.mnt);
4631 if (error)
4632 return error;
4633 do_truncate = true;
4634 }
4635 error = may_open(idmap, &nd->path, acc_mode, open_flag);
4636 if (!error && !(file->f_mode & FMODE_OPENED))
4637 error = vfs_open(&nd->path, file);
4638 if (!error)
4639 error = security_file_post_open(file, op->acc_mode);
4640 if (!error && do_truncate)
4641 error = handle_truncate(idmap, file);
4642 if (unlikely(error > 0)) {
4643 WARN_ON(1);
4644 error = -EINVAL;
4645 }
4646 if (do_truncate)
4647 mnt_drop_write(nd->path.mnt);
4648 return error;
4649}
4650
4651/**
4652 * vfs_tmpfile - create tmpfile
4653 * @idmap: idmap of the mount the inode was found from
4654 * @parentpath: pointer to the path of the base directory
4655 * @file: file descriptor of the new tmpfile
4656 * @mode: mode of the new tmpfile
4657 *
4658 * Create a temporary file.
4659 *
4660 * If the inode has been found through an idmapped mount the idmap of
4661 * the vfsmount must be passed through @idmap. This function will then take
4662 * care to map the inode according to @idmap before checking permissions.
4663 * On non-idmapped mounts or if permission checking is to be performed on the
4664 * raw inode simply pass @nop_mnt_idmap.
4665 */
4666int vfs_tmpfile(struct mnt_idmap *idmap,
4667 const struct path *parentpath,
4668 struct file *file, umode_t mode)
4669{
4670 struct dentry *child;
4671 struct inode *dir = d_inode(parentpath->dentry);
4672 struct inode *inode;
4673 int error;
4674 int open_flag = file->f_flags;
4675
4676 /* we want directory to be writable */
4677 error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
4678 if (error)
4679 return error;
4680 if (!dir->i_op->tmpfile)
4681 return -EOPNOTSUPP;
4682 child = d_alloc(parentpath->dentry, &slash_name);
4683 if (unlikely(!child))
4684 return -ENOMEM;
4685 file->__f_path.mnt = parentpath->mnt;
4686 file->__f_path.dentry = child;
4687 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
4688 error = dir->i_op->tmpfile(idmap, dir, file, mode);
4689 dput(child);
4690 if (file->f_mode & FMODE_OPENED)
4691 fsnotify_open(file);
4692 if (error)
4693 return error;
4694 /* Don't check for other permissions, the inode was just created */
4695 error = may_open(idmap, &file->f_path, 0, file->f_flags);
4696 if (error)
4697 return error;
4698 inode = file_inode(file);
4699 if (!(open_flag & O_EXCL)) {
4700 spin_lock(&inode->i_lock);
4701 inode_state_set(inode, I_LINKABLE);
4702 spin_unlock(&inode->i_lock);
4703 }
4704 security_inode_post_create_tmpfile(idmap, inode);
4705 return 0;
4706}
4707
4708/**
4709 * kernel_tmpfile_open - open a tmpfile for kernel internal use
4710 * @idmap: idmap of the mount the inode was found from
4711 * @parentpath: path of the base directory
4712 * @mode: mode of the new tmpfile
4713 * @open_flag: flags
4714 * @cred: credentials for open
4715 *
4716 * Create and open a temporary file. The file is not accounted in nr_files,
4717 * hence this is only for kernel internal use, and must not be installed into
4718 * file tables or such.
4719 */
4720struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
4721 const struct path *parentpath,
4722 umode_t mode, int open_flag,
4723 const struct cred *cred)
4724{
4725 struct file *file;
4726 int error;
4727
4728 file = alloc_empty_file_noaccount(open_flag, cred);
4729 if (IS_ERR(file))
4730 return file;
4731
4732 error = vfs_tmpfile(idmap, parentpath, file, mode);
4733 if (error) {
4734 fput(file);
4735 file = ERR_PTR(error);
4736 }
4737 return file;
4738}
4739EXPORT_SYMBOL(kernel_tmpfile_open);
4740
/*
 * Implement the O_TMPFILE open path: resolve the directory named by the
 * path, take write access on its mount and create an unnamed file in it
 * via vfs_tmpfile().
 */
static int do_tmpfile(struct nameidata *nd, unsigned flags,
		const struct open_flags *op,
		struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

	if (unlikely(error))
		return error;
	error = mnt_want_write(path.mnt);
	if (unlikely(error))
		goto out;
	error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
	if (error)
		goto out2;
	audit_inode(nd->name, file->f_path.dentry, 0);
out2:
	mnt_drop_write(path.mnt);
out:
	path_put(&path);
	return error;
}
4763
4764static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
4765{
4766 struct path path;
4767 int error = path_lookupat(nd, flags, &path);
4768 if (!error) {
4769 audit_inode(nd->name, path.dentry, 0);
4770 error = vfs_open(&path, file);
4771 path_put(&path);
4772 }
4773 return error;
4774}
4775
/*
 * Core of open(): allocate an empty struct file and drive the path walk
 * (or the O_TMPFILE / O_PATH special cases).  On failure the file is
 * released and -EOPENSTALE is mapped to -ECHILD (to trigger a non-RCU
 * retry) or -ESTALE (to trigger a LOOKUP_REVAL retry).
 */
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);
		/* walk components, restarting on each symlink body returned */
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);
		terminate_walk(nd);
	}
	if (likely(!error)) {
		if (likely(file->f_mode & FMODE_OPENED))
			return file;
		WARN_ON(1);
		error = -EINVAL;
	}
	fput_close(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
	}
	return ERR_PTR(error);
}
4814
/*
 * Top-level open relative to @dfd: try the lazy RCU walk first, fall
 * back to ref-walk on -ECHILD, and retry with LOOKUP_REVAL if cached
 * state turned out to be stale (-ESTALE).
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	int flags = op->lookup_flags;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(&nd, op, flags);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	return filp;
}
4831
/*
 * Like do_filp_open(), but resolve @name relative to the given @root
 * path instead of a file descriptor.  A symlink as root is rejected for
 * LOOKUP_OPEN intents.  Uses the same RCU / ref-walk / REVAL retry
 * sequence as do_filp_open().
 */
struct file *do_file_open_root(const struct path *root,
		const char *name, const struct open_flags *op)
{
	struct nameidata nd;
	struct file *file;
	struct filename *filename;
	int flags = op->lookup_flags;

	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
		return ERR_PTR(-ELOOP);

	filename = getname_kernel(name);
	if (IS_ERR(filename))
		return ERR_CAST(filename);

	set_nameidata(&nd, -1, filename, root);
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(&nd, op, flags);
	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	putname(filename);
	return file;
}
4857
/*
 * Resolve @name down to its parent directory and return a (typically
 * negative) locked dentry for the last component, ready for a create
 * operation.  On success, write access on the mount is held and @path
 * references the parent.  On failure an ERR_PTR is returned and the
 * path reference is dropped.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
				      struct path *path, unsigned int lookup_flags)
{
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct qstr last;
	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
	unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
	int type;
	int error;

	error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
	if (error)
		return ERR_PTR(error);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (unlikely(type != LAST_NORM))
		goto out;

	/* don't fail immediately if it's r/o, at least try to report other errors */
	error = mnt_want_write(path->mnt);
	/*
	 * Do the final lookup. Suppress 'create' if there is a trailing
	 * '/', and a directory wasn't requested.
	 */
	if (last.name[last.len] && !want_dir)
		create_flags &= ~LOOKUP_CREATE;
	dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
	if (IS_ERR(dentry))
		goto out_drop_write;

	/* now report a deferred mnt_want_write() failure, if any */
	if (unlikely(error))
		goto fail;

	return dentry;
fail:
	end_dirop(dentry);
	dentry = ERR_PTR(error);
out_drop_write:
	if (!error)
		mnt_drop_write(path->mnt);
out:
	path_put(path);
	return dentry;
}
4906
/*
 * Kernel-string variant: wrap @pathname in a struct filename and hand
 * it to filename_create(), releasing the name afterwards.
 */
struct dentry *start_creating_path(int dfd, const char *pathname,
				   struct path *path, unsigned int lookup_flags)
{
	struct filename *tmp;
	struct dentry *dentry;

	tmp = getname_kernel(pathname);
	dentry = filename_create(dfd, tmp, path, lookup_flags);
	putname(tmp);
	return dentry;
}
EXPORT_SYMBOL(start_creating_path);
4917
4918/**
4919 * end_creating_path - finish a code section started by start_creating_path()
4920 * @path: the path instantiated by start_creating_path()
4921 * @dentry: the dentry returned by start_creating_path()
4922 *
4923 * end_creating_path() will unlock and locks taken by start_creating_path()
4924 * and drop an references that were taken. It should only be called
4925 * if start_creating_path() returned a non-error.
4926 * If vfs_mkdir() was called and it returned an error, that error *should*
4927 * be passed to end_creating_path() together with the path.
4928 */
4929void end_creating_path(const struct path *path, struct dentry *dentry)
4930{
4931 end_creating(dentry);
4932 mnt_drop_write(path->mnt);
4933 path_put(path);
4934}
4935EXPORT_SYMBOL(end_creating_path);
4936
4937inline struct dentry *start_creating_user_path(
4938 int dfd, const char __user *pathname,
4939 struct path *path, unsigned int lookup_flags)
4940{
4941 struct filename *filename = getname(pathname);
4942 struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
4943
4944 putname(filename);
4945 return res;
4946}
4947EXPORT_SYMBOL(start_creating_user_path);
4948
4949
4950/**
4951 * vfs_mknod - create device node or file
4952 * @idmap: idmap of the mount the inode was found from
4953 * @dir: inode of the parent directory
4954 * @dentry: dentry of the child device node
4955 * @mode: mode of the child device node
4956 * @dev: device number of device to create
4957 * @delegated_inode: returns parent inode, if the inode is delegated.
4958 *
4959 * Create a device node or file.
4960 *
4961 * If the inode has been found through an idmapped mount the idmap of
4962 * the vfsmount must be passed through @idmap. This function will then take
4963 * care to map the inode according to @idmap before checking permissions.
4964 * On non-idmapped mounts or if permission checking is to be performed on the
4965 * raw inode simply pass @nop_mnt_idmap.
4966 */
4967int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
4968 struct dentry *dentry, umode_t mode, dev_t dev,
4969 struct delegated_inode *delegated_inode)
4970{
4971 bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
4972 int error = may_create(idmap, dir, dentry);
4973
4974 if (error)
4975 return error;
4976
4977 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
4978 !capable(CAP_MKNOD))
4979 return -EPERM;
4980
4981 if (!dir->i_op->mknod)
4982 return -EPERM;
4983
4984 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
4985 error = devcgroup_inode_mknod(mode, dev);
4986 if (error)
4987 return error;
4988
4989 error = security_inode_mknod(dir, dentry, mode, dev);
4990 if (error)
4991 return error;
4992
4993 error = try_break_deleg(dir, delegated_inode);
4994 if (error)
4995 return error;
4996
4997 error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4998 if (!error)
4999 fsnotify_create(dir, dentry);
5000 return error;
5001}
5002EXPORT_SYMBOL(vfs_mknod);
5003
5004static int may_mknod(umode_t mode)
5005{
5006 switch (mode & S_IFMT) {
5007 case S_IFREG:
5008 case S_IFCHR:
5009 case S_IFBLK:
5010 case S_IFIFO:
5011 case S_IFSOCK:
5012 case 0: /* zero mode translates to S_IFREG */
5013 return 0;
5014 case S_IFDIR:
5015 return -EPERM;
5016 default:
5017 return -EINVAL;
5018 }
5019}
5020
/*
 * Common implementation of mknod(2)/mknodat(2): validate the type,
 * look up and lock the target, then dispatch to vfs_create() or
 * vfs_mknod().  Retries on broken delegations and on ESTALE (with
 * LOOKUP_REVAL).  Consumes @name.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
		unsigned int dev)
{
	struct delegated_inode di = { };
	struct mnt_idmap *idmap;
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		goto out1;
retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out1;

	error = security_path_mknod(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode), dev);
	if (error)
		goto out2;

	idmap = mnt_idmap(path.mnt);
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(idmap, dentry, mode, &di);
			if (!error)
				security_path_post_mknod(idmap, dentry);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, new_decode_dev(dev), &di);
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, 0, &di);
			break;
	}
out2:
	end_creating_path(&path, dentry);
	if (is_delegated(&di)) {
		error = break_deleg_wait(&di);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out1:
	putname(name);
	return error;
}
5076
/* mknodat(2): create a filesystem node relative to @dfd */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, getname(filename), mode, dev);
}
5082
/* mknod(2): create a filesystem node relative to the cwd */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
5087
5088/**
5089 * vfs_mkdir - create directory returning correct dentry if possible
5090 * @idmap: idmap of the mount the inode was found from
5091 * @dir: inode of the parent directory
5092 * @dentry: dentry of the child directory
5093 * @mode: mode of the child directory
5094 * @delegated_inode: returns parent inode, if the inode is delegated.
5095 *
5096 * Create a directory.
5097 *
5098 * If the inode has been found through an idmapped mount the idmap of
5099 * the vfsmount must be passed through @idmap. This function will then take
5100 * care to map the inode according to @idmap before checking permissions.
5101 * On non-idmapped mounts or if permission checking is to be performed on the
5102 * raw inode simply pass @nop_mnt_idmap.
5103 *
5104 * In the event that the filesystem does not use the *@dentry but leaves it
5105 * negative or unhashes it and possibly splices a different one returning it,
5106 * the original dentry is dput() and the alternate is returned.
5107 *
5108 * In case of an error the dentry is dput() and an ERR_PTR() is returned.
5109 */
5110struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
5111 struct dentry *dentry, umode_t mode,
5112 struct delegated_inode *delegated_inode)
5113{
5114 int error;
5115 unsigned max_links = dir->i_sb->s_max_links;
5116 struct dentry *de;
5117
5118 error = may_create(idmap, dir, dentry);
5119 if (error)
5120 goto err;
5121
5122 error = -EPERM;
5123 if (!dir->i_op->mkdir)
5124 goto err;
5125
5126 mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
5127 error = security_inode_mkdir(dir, dentry, mode);
5128 if (error)
5129 goto err;
5130
5131 error = -EMLINK;
5132 if (max_links && dir->i_nlink >= max_links)
5133 goto err;
5134
5135 error = try_break_deleg(dir, delegated_inode);
5136 if (error)
5137 goto err;
5138
5139 de = dir->i_op->mkdir(idmap, dir, dentry, mode);
5140 error = PTR_ERR(de);
5141 if (IS_ERR(de))
5142 goto err;
5143 if (de) {
5144 dput(dentry);
5145 dentry = de;
5146 }
5147 fsnotify_mkdir(dir, dentry);
5148 return dentry;
5149
5150err:
5151 end_creating(dentry);
5152 return ERR_PTR(error);
5153}
5154EXPORT_SYMBOL(vfs_mkdir);
5155
/*
 * Common implementation of mkdir(2)/mkdirat(2): look up and lock the
 * target, run the security hook, and call vfs_mkdir().  Retries on
 * broken delegations and on ESTALE (with LOOKUP_REVAL).  Consumes
 * @name.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;
	struct delegated_inode delegated_inode = { };

retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = security_path_mkdir(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode));
	if (!error) {
		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
				   dentry, mode, &delegated_inode);
		if (IS_ERR(dentry))
			error = PTR_ERR(dentry);
	}
	end_creating_path(&path, dentry);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(name);
	return error;
}
5192
/* mkdirat(2): create a directory relative to @dfd */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, getname(pathname), mode);
}
5197
/* mkdir(2): mkdirat() relative to the current working directory */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
5202
5203/**
5204 * vfs_rmdir - remove directory
5205 * @idmap: idmap of the mount the inode was found from
5206 * @dir: inode of the parent directory
5207 * @dentry: dentry of the child directory
5208 * @delegated_inode: returns parent inode, if it's delegated.
5209 *
5210 * Remove a directory.
5211 *
5212 * If the inode has been found through an idmapped mount the idmap of
5213 * the vfsmount must be passed through @idmap. This function will then take
5214 * care to map the inode according to @idmap before checking permissions.
5215 * On non-idmapped mounts or if permission checking is to be performed on the
5216 * raw inode simply pass @nop_mnt_idmap.
5217 */
5218int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
5219 struct dentry *dentry, struct delegated_inode *delegated_inode)
5220{
5221 int error = may_delete(idmap, dir, dentry, 1);
5222
5223 if (error)
5224 return error;
5225
5226 if (!dir->i_op->rmdir)
5227 return -EPERM;
5228
5229 dget(dentry);
5230 inode_lock(dentry->d_inode);
5231
5232 error = -EBUSY;
5233 if (is_local_mountpoint(dentry) ||
5234 (dentry->d_inode->i_flags & S_KERNEL_FILE))
5235 goto out;
5236
5237 error = security_inode_rmdir(dir, dentry);
5238 if (error)
5239 goto out;
5240
5241 error = try_break_deleg(dir, delegated_inode);
5242 if (error)
5243 goto out;
5244
5245 error = dir->i_op->rmdir(dir, dentry);
5246 if (error)
5247 goto out;
5248
5249 shrink_dcache_parent(dentry);
5250 dentry->d_inode->i_flags |= S_DEAD;
5251 dont_mount(dentry);
5252 detach_mounts(dentry);
5253
5254out:
5255 inode_unlock(dentry->d_inode);
5256 dput(dentry);
5257 if (!error)
5258 d_delete_notify(dir, dentry);
5259 return error;
5260}
5261EXPORT_SYMBOL(vfs_rmdir);
5262
/*
 * Remove the directory named by @name relative to @dfd.  Consumes @name.
 * Retries after a delegation break and once more with LOOKUP_REVAL on
 * -ESTALE.
 */
int do_rmdir(int dfd, struct filename *name)
{
	int error;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
	struct delegated_inode delegated_inode = { };
retry:
	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
	if (error)
		goto exit1;

	/* only an ordinary last component may be removed */
	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit2;
	case LAST_DOT:
		error = -EINVAL;
		goto exit2;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit2;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit2;

	/* locks the parent and looks up the victim */
	dentry = start_dirop(path.dentry, &last, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit3;
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit4;
	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode,
			  dentry, &delegated_inode);
exit4:
	end_dirop(dentry);
exit3:
	mnt_drop_write(path.mnt);
exit2:
	path_put(&path);
	if (is_delegated(&delegated_inode)) {
		/* all locks dropped - safe to wait for the delegation break */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit1:
	putname(name);
	return error;
}
5321
/* rmdir(2): remove a directory relative to the current working directory */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, getname(pathname));
}
5326
5327/**
5328 * vfs_unlink - unlink a filesystem object
5329 * @idmap: idmap of the mount the inode was found from
5330 * @dir: parent directory
5331 * @dentry: victim
5332 * @delegated_inode: returns victim inode, if the inode is delegated.
5333 *
5334 * The caller must hold dir->i_rwsem exclusively.
5335 *
5336 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
5337 * return a reference to the inode in delegated_inode. The caller
5338 * should then break the delegation on that inode and retry. Because
5339 * breaking a delegation may take a long time, the caller should drop
5340 * dir->i_rwsem before doing so.
5341 *
5342 * Alternatively, a caller may pass NULL for delegated_inode. This may
5343 * be appropriate for callers that expect the underlying filesystem not
5344 * to be NFS exported.
5345 *
5346 * If the inode has been found through an idmapped mount the idmap of
5347 * the vfsmount must be passed through @idmap. This function will then take
5348 * care to map the inode according to @idmap before checking permissions.
5349 * On non-idmapped mounts or if permission checking is to be performed on the
5350 * raw inode simply pass @nop_mnt_idmap.
5351 */
5352int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
5353 struct dentry *dentry, struct delegated_inode *delegated_inode)
5354{
5355 struct inode *target = dentry->d_inode;
5356 int error = may_delete(idmap, dir, dentry, 0);
5357
5358 if (error)
5359 return error;
5360
5361 if (!dir->i_op->unlink)
5362 return -EPERM;
5363
5364 inode_lock(target);
5365 if (IS_SWAPFILE(target))
5366 error = -EPERM;
5367 else if (is_local_mountpoint(dentry))
5368 error = -EBUSY;
5369 else {
5370 error = security_inode_unlink(dir, dentry);
5371 if (!error) {
5372 error = try_break_deleg(dir, delegated_inode);
5373 if (error)
5374 goto out;
5375 error = try_break_deleg(target, delegated_inode);
5376 if (error)
5377 goto out;
5378 error = dir->i_op->unlink(dir, dentry);
5379 if (!error) {
5380 dont_mount(dentry);
5381 detach_mounts(dentry);
5382 }
5383 }
5384 }
5385out:
5386 inode_unlock(target);
5387
5388 /* We don't d_delete() NFS sillyrenamed files--they still exist. */
5389 if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
5390 fsnotify_unlink(dir, dentry);
5391 } else if (!error) {
5392 fsnotify_link_count(target);
5393 d_delete_notify(dir, dentry);
5394 }
5395
5396 return error;
5397}
5398EXPORT_SYMBOL(vfs_unlink);
5399
5400/*
5401 * Make sure that the actual truncation of the file will occur outside its
5402 * directory's i_rwsem. Truncate can take a long time if there is a lot of
5403 * writeout happening, and we don't want to prevent access to the directory
5404 * while waiting on the I/O.
5405 */
5406int do_unlinkat(int dfd, struct filename *name)
5407{
5408 int error;
5409 struct dentry *dentry;
5410 struct path path;
5411 struct qstr last;
5412 int type;
5413 struct inode *inode;
5414 struct delegated_inode delegated_inode = { };
5415 unsigned int lookup_flags = 0;
5416retry:
5417 error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
5418 if (error)
5419 goto exit_putname;
5420
5421 error = -EISDIR;
5422 if (type != LAST_NORM)
5423 goto exit_path_put;
5424
5425 error = mnt_want_write(path.mnt);
5426 if (error)
5427 goto exit_path_put;
5428retry_deleg:
5429 dentry = start_dirop(path.dentry, &last, lookup_flags);
5430 error = PTR_ERR(dentry);
5431 if (IS_ERR(dentry))
5432 goto exit_drop_write;
5433
5434 /* Why not before? Because we want correct error value */
5435 if (unlikely(last.name[last.len])) {
5436 if (d_is_dir(dentry))
5437 error = -EISDIR;
5438 else
5439 error = -ENOTDIR;
5440 end_dirop(dentry);
5441 goto exit_drop_write;
5442 }
5443 inode = dentry->d_inode;
5444 ihold(inode);
5445 error = security_path_unlink(&path, dentry);
5446 if (error)
5447 goto exit_end_dirop;
5448 error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
5449 dentry, &delegated_inode);
5450exit_end_dirop:
5451 end_dirop(dentry);
5452 iput(inode); /* truncate the inode here */
5453 if (is_delegated(&delegated_inode)) {
5454 error = break_deleg_wait(&delegated_inode);
5455 if (!error)
5456 goto retry_deleg;
5457 }
5458exit_drop_write:
5459 mnt_drop_write(path.mnt);
5460exit_path_put:
5461 path_put(&path);
5462 if (retry_estale(error, lookup_flags)) {
5463 lookup_flags |= LOOKUP_REVAL;
5464 goto retry;
5465 }
5466exit_putname:
5467 putname(name);
5468 return error;
5469}
5470
5471SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
5472{
5473 if ((flag & ~AT_REMOVEDIR) != 0)
5474 return -EINVAL;
5475
5476 if (flag & AT_REMOVEDIR)
5477 return do_rmdir(dfd, getname(pathname));
5478 return do_unlinkat(dfd, getname(pathname));
5479}
5480
/* unlink(2): remove a file relative to the current working directory */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, getname(pathname));
}
5485
5486/**
5487 * vfs_symlink - create symlink
5488 * @idmap: idmap of the mount the inode was found from
5489 * @dir: inode of the parent directory
5490 * @dentry: dentry of the child symlink file
5491 * @oldname: name of the file to link to
5492 * @delegated_inode: returns victim inode, if the inode is delegated.
5493 *
5494 * Create a symlink.
5495 *
5496 * If the inode has been found through an idmapped mount the idmap of
5497 * the vfsmount must be passed through @idmap. This function will then take
5498 * care to map the inode according to @idmap before checking permissions.
5499 * On non-idmapped mounts or if permission checking is to be performed on the
5500 * raw inode simply pass @nop_mnt_idmap.
5501 */
5502int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
5503 struct dentry *dentry, const char *oldname,
5504 struct delegated_inode *delegated_inode)
5505{
5506 int error;
5507
5508 error = may_create(idmap, dir, dentry);
5509 if (error)
5510 return error;
5511
5512 if (!dir->i_op->symlink)
5513 return -EPERM;
5514
5515 error = security_inode_symlink(dir, dentry, oldname);
5516 if (error)
5517 return error;
5518
5519 error = try_break_deleg(dir, delegated_inode);
5520 if (error)
5521 return error;
5522
5523 error = dir->i_op->symlink(idmap, dir, dentry, oldname);
5524 if (!error)
5525 fsnotify_create(dir, dentry);
5526 return error;
5527}
5528EXPORT_SYMBOL(vfs_symlink);
5529
/*
 * Create a symlink at @to (relative to @newdfd) whose body is @from->name.
 * Consumes both filenames; @from may already be an ERR_PTR from getname(),
 * which is propagated.  Retries after a delegation break and once more
 * with LOOKUP_REVAL on -ESTALE.
 */
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
	int error;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;
	struct delegated_inode delegated_inode = { };

	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto out_putnames;
	}
retry:
	dentry = filename_create(newdfd, to, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putnames;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				    dentry, from->name, &delegated_inode);
	end_creating_path(&path, dentry);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putnames:
	putname(to);
	putname(from);
	return error;
}
5567
/* symlinkat(2): create a symlink to @oldname at @newname relative to @newdfd */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
5573
/* symlink(2): symlinkat() relative to the current working directory */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
5578
5579/**
5580 * vfs_link - create a new link
5581 * @old_dentry: object to be linked
5582 * @idmap: idmap of the mount
5583 * @dir: new parent
5584 * @new_dentry: where to create the new link
5585 * @delegated_inode: returns inode needing a delegation break
5586 *
5587 * The caller must hold dir->i_rwsem exclusively.
5588 *
5589 * If vfs_link discovers a delegation on the to-be-linked file in need
5590 * of breaking, it will return -EWOULDBLOCK and return a reference to the
5591 * inode in delegated_inode. The caller should then break the delegation
5592 * and retry. Because breaking a delegation may take a long time, the
5593 * caller should drop the i_rwsem before doing so.
5594 *
5595 * Alternatively, a caller may pass NULL for delegated_inode. This may
5596 * be appropriate for callers that expect the underlying filesystem not
5597 * to be NFS exported.
5598 *
5599 * If the inode has been found through an idmapped mount the idmap of
5600 * the vfsmount must be passed through @idmap. This function will then take
5601 * care to map the inode according to @idmap before checking permissions.
5602 * On non-idmapped mounts or if permission checking is to be performed on the
5603 * raw inode simply pass @nop_mnt_idmap.
5604 */
5605int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
5606 struct inode *dir, struct dentry *new_dentry,
5607 struct delegated_inode *delegated_inode)
5608{
5609 struct inode *inode = old_dentry->d_inode;
5610 unsigned max_links = dir->i_sb->s_max_links;
5611 int error;
5612
5613 if (!inode)
5614 return -ENOENT;
5615
5616 error = may_create(idmap, dir, new_dentry);
5617 if (error)
5618 return error;
5619
5620 if (dir->i_sb != inode->i_sb)
5621 return -EXDEV;
5622
5623 /*
5624 * A link to an append-only or immutable file cannot be created.
5625 */
5626 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
5627 return -EPERM;
5628 /*
5629 * Updating the link count will likely cause i_uid and i_gid to
5630 * be written back improperly if their true value is unknown to
5631 * the vfs.
5632 */
5633 if (HAS_UNMAPPED_ID(idmap, inode))
5634 return -EPERM;
5635 if (!dir->i_op->link)
5636 return -EPERM;
5637 if (S_ISDIR(inode->i_mode))
5638 return -EPERM;
5639
5640 error = security_inode_link(old_dentry, dir, new_dentry);
5641 if (error)
5642 return error;
5643
5644 inode_lock(inode);
5645 /* Make sure we don't allow creating hardlink to an unlinked file */
5646 if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
5647 error = -ENOENT;
5648 else if (max_links && inode->i_nlink >= max_links)
5649 error = -EMLINK;
5650 else {
5651 error = try_break_deleg(dir, delegated_inode);
5652 if (!error)
5653 error = try_break_deleg(inode, delegated_inode);
5654 if (!error)
5655 error = dir->i_op->link(old_dentry, dir, new_dentry);
5656 }
5657
5658 if (!error && (inode_state_read_once(inode) & I_LINKABLE)) {
5659 spin_lock(&inode->i_lock);
5660 inode_state_clear(inode, I_LINKABLE);
5661 spin_unlock(&inode->i_lock);
5662 }
5663 inode_unlock(inode);
5664 if (!error)
5665 fsnotify_link(dir, inode, new_dentry);
5666 return error;
5667}
5668EXPORT_SYMBOL(vfs_link);
5669
5670/*
5671 * Hardlinks are often used in delicate situations. We avoid
5672 * security-related surprises by not following symlinks on the
5673 * newname. --KAB
5674 *
5675 * We don't follow them on the oldname either to be compatible
5676 * with linux 2.0, and to avoid hard-linking to directories
5677 * and other special files. --ADM
5678 */
5679int do_linkat(int olddfd, struct filename *old, int newdfd,
5680 struct filename *new, int flags)
5681{
5682 struct mnt_idmap *idmap;
5683 struct dentry *new_dentry;
5684 struct path old_path, new_path;
5685 struct delegated_inode delegated_inode = { };
5686 int how = 0;
5687 int error;
5688
5689 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
5690 error = -EINVAL;
5691 goto out_putnames;
5692 }
5693 /*
5694 * To use null names we require CAP_DAC_READ_SEARCH or
5695 * that the open-time creds of the dfd matches current.
5696 * This ensures that not everyone will be able to create
5697 * a hardlink using the passed file descriptor.
5698 */
5699 if (flags & AT_EMPTY_PATH)
5700 how |= LOOKUP_LINKAT_EMPTY;
5701
5702 if (flags & AT_SYMLINK_FOLLOW)
5703 how |= LOOKUP_FOLLOW;
5704retry:
5705 error = filename_lookup(olddfd, old, how, &old_path, NULL);
5706 if (error)
5707 goto out_putnames;
5708
5709 new_dentry = filename_create(newdfd, new, &new_path,
5710 (how & LOOKUP_REVAL));
5711 error = PTR_ERR(new_dentry);
5712 if (IS_ERR(new_dentry))
5713 goto out_putpath;
5714
5715 error = -EXDEV;
5716 if (old_path.mnt != new_path.mnt)
5717 goto out_dput;
5718 idmap = mnt_idmap(new_path.mnt);
5719 error = may_linkat(idmap, &old_path);
5720 if (unlikely(error))
5721 goto out_dput;
5722 error = security_path_link(old_path.dentry, &new_path, new_dentry);
5723 if (error)
5724 goto out_dput;
5725 error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
5726 new_dentry, &delegated_inode);
5727out_dput:
5728 end_creating_path(&new_path, new_dentry);
5729 if (is_delegated(&delegated_inode)) {
5730 error = break_deleg_wait(&delegated_inode);
5731 if (!error) {
5732 path_put(&old_path);
5733 goto retry;
5734 }
5735 }
5736 if (retry_estale(error, how)) {
5737 path_put(&old_path);
5738 how |= LOOKUP_REVAL;
5739 goto retry;
5740 }
5741out_putpath:
5742 path_put(&old_path);
5743out_putnames:
5744 putname(old);
5745 putname(new);
5746
5747 return error;
5748}
5749
/* linkat(2): create a hard link; flags may carry AT_SYMLINK_FOLLOW/AT_EMPTY_PATH */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, getname_uflags(oldname, flags),
		newdfd, getname(newname), flags);
}
5756
/* link(2): linkat() with both paths relative to the current working directory */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
5761
5762/**
5763 * vfs_rename - rename a filesystem object
5764 * @rd: pointer to &struct renamedata info
5765 *
5766 * The caller must hold multiple mutexes--see lock_rename()).
5767 *
5768 * If vfs_rename discovers a delegation in need of breaking at either
5769 * the source or destination, it will return -EWOULDBLOCK and return a
5770 * reference to the inode in delegated_inode. The caller should then
5771 * break the delegation and retry. Because breaking a delegation may
5772 * take a long time, the caller should drop all locks before doing
5773 * so.
5774 *
5775 * Alternatively, a caller may pass NULL for delegated_inode. This may
5776 * be appropriate for callers that expect the underlying filesystem not
5777 * to be NFS exported.
5778 *
5779 * The worst of all namespace operations - renaming directory. "Perverted"
5780 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
5781 * Problems:
5782 *
5783 * a) we can get into loop creation.
5784 * b) race potential - two innocent renames can create a loop together.
5785 * That's where 4.4BSD screws up. Current fix: serialization on
5786 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
5787 * story.
5788 * c) we may have to lock up to _four_ objects - parents and victim (if it exists),
5789 * and source (if it's a non-directory or a subdirectory that moves to
5790 * different parent).
5791 * And that - after we got ->i_rwsem on parents (until then we don't know
5792 * whether the target exists). Solution: try to be smart with locking
5793 * order for inodes. We rely on the fact that tree topology may change
5794 * only under ->s_vfs_rename_mutex _and_ that parent of the object we
5795 * move will be locked. Thus we can rank directories by the tree
5796 * (ancestors first) and rank all non-directories after them.
5797 * That works since everybody except rename does "lock parent, lookup,
5798 * lock child" and rename is under ->s_vfs_rename_mutex.
5799 * HOWEVER, it relies on the assumption that any object with ->lookup()
5800 * has no more than 1 dentry. If "hybrid" objects will ever appear,
5801 * we'd better make sure that there's no link(2) for them.
5802 * d) conversion from fhandle to dentry may come in the wrong moment - when
5803 * we are removing the target. Solution: we will have to grab ->i_rwsem
5804 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
5805 * ->i_rwsem on parents, which works but leads to some truly excessive
5806 * locking].
5807 */
5808int vfs_rename(struct renamedata *rd)
5809{
5810 int error;
5811 struct inode *old_dir = d_inode(rd->old_parent);
5812 struct inode *new_dir = d_inode(rd->new_parent);
5813 struct dentry *old_dentry = rd->old_dentry;
5814 struct dentry *new_dentry = rd->new_dentry;
5815 struct delegated_inode *delegated_inode = rd->delegated_inode;
5816 unsigned int flags = rd->flags;
5817 bool is_dir = d_is_dir(old_dentry);
5818 struct inode *source = old_dentry->d_inode;
5819 struct inode *target = new_dentry->d_inode;
5820 bool new_is_dir = false;
5821 unsigned max_links = new_dir->i_sb->s_max_links;
5822 struct name_snapshot old_name;
5823 bool lock_old_subdir, lock_new_subdir;
5824
5825 if (source == target)
5826 return 0;
5827
5828 error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir);
5829 if (error)
5830 return error;
5831
5832 if (!target) {
5833 error = may_create(rd->mnt_idmap, new_dir, new_dentry);
5834 } else {
5835 new_is_dir = d_is_dir(new_dentry);
5836
5837 if (!(flags & RENAME_EXCHANGE))
5838 error = may_delete(rd->mnt_idmap, new_dir,
5839 new_dentry, is_dir);
5840 else
5841 error = may_delete(rd->mnt_idmap, new_dir,
5842 new_dentry, new_is_dir);
5843 }
5844 if (error)
5845 return error;
5846
5847 if (!old_dir->i_op->rename)
5848 return -EPERM;
5849
5850 /*
5851 * If we are going to change the parent - check write permissions,
5852 * we'll need to flip '..'.
5853 */
5854 if (new_dir != old_dir) {
5855 if (is_dir) {
5856 error = inode_permission(rd->mnt_idmap, source,
5857 MAY_WRITE);
5858 if (error)
5859 return error;
5860 }
5861 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
5862 error = inode_permission(rd->mnt_idmap, target,
5863 MAY_WRITE);
5864 if (error)
5865 return error;
5866 }
5867 }
5868
5869 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
5870 flags);
5871 if (error)
5872 return error;
5873
5874 take_dentry_name_snapshot(&old_name, old_dentry);
5875 dget(new_dentry);
5876 /*
5877 * Lock children.
5878 * The source subdirectory needs to be locked on cross-directory
5879 * rename or cross-directory exchange since its parent changes.
5880 * The target subdirectory needs to be locked on cross-directory
5881 * exchange due to parent change and on any rename due to becoming
5882 * a victim.
5883 * Non-directories need locking in all cases (for NFS reasons);
5884 * they get locked after any subdirectories (in inode address order).
5885 *
5886 * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
5887 * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
5888 */
5889 lock_old_subdir = new_dir != old_dir;
5890 lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
5891 if (is_dir) {
5892 if (lock_old_subdir)
5893 inode_lock_nested(source, I_MUTEX_CHILD);
5894 if (target && (!new_is_dir || lock_new_subdir))
5895 inode_lock(target);
5896 } else if (new_is_dir) {
5897 if (lock_new_subdir)
5898 inode_lock_nested(target, I_MUTEX_CHILD);
5899 inode_lock(source);
5900 } else {
5901 lock_two_nondirectories(source, target);
5902 }
5903
5904 error = -EPERM;
5905 if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
5906 goto out;
5907
5908 error = -EBUSY;
5909 if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
5910 goto out;
5911
5912 if (max_links && new_dir != old_dir) {
5913 error = -EMLINK;
5914 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
5915 goto out;
5916 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
5917 old_dir->i_nlink >= max_links)
5918 goto out;
5919 }
5920 error = try_break_deleg(old_dir, delegated_inode);
5921 if (error)
5922 goto out;
5923 if (new_dir != old_dir) {
5924 error = try_break_deleg(new_dir, delegated_inode);
5925 if (error)
5926 goto out;
5927 }
5928 if (!is_dir) {
5929 error = try_break_deleg(source, delegated_inode);
5930 if (error)
5931 goto out;
5932 }
5933 if (target && !new_is_dir) {
5934 error = try_break_deleg(target, delegated_inode);
5935 if (error)
5936 goto out;
5937 }
5938 error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
5939 new_dir, new_dentry, flags);
5940 if (error)
5941 goto out;
5942
5943 if (!(flags & RENAME_EXCHANGE) && target) {
5944 if (is_dir) {
5945 shrink_dcache_parent(new_dentry);
5946 target->i_flags |= S_DEAD;
5947 }
5948 dont_mount(new_dentry);
5949 detach_mounts(new_dentry);
5950 }
5951 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
5952 if (!(flags & RENAME_EXCHANGE))
5953 d_move(old_dentry, new_dentry);
5954 else
5955 d_exchange(old_dentry, new_dentry);
5956 }
5957out:
5958 if (!is_dir || lock_old_subdir)
5959 inode_unlock(source);
5960 if (target && (!new_is_dir || lock_new_subdir))
5961 inode_unlock(target);
5962 dput(new_dentry);
5963 if (!error) {
5964 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
5965 !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
5966 if (flags & RENAME_EXCHANGE) {
5967 fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
5968 new_is_dir, NULL, new_dentry);
5969 }
5970 }
5971 release_dentry_name_snapshot(&old_name);
5972
5973 return error;
5974}
5975EXPORT_SYMBOL(vfs_rename);
5976
/*
 * Rename @from (relative to @olddfd) to @to (relative to @newdfd) with
 * renameat2(2) @flags.  Consumes both filenames.  Retries after a
 * delegation break and once more with LOOKUP_REVAL on -ESTALE.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
{
	struct renamedata rd;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct delegated_inode delegated_inode = { };
	unsigned int lookup_flags = 0;
	bool should_retry = false;
	int error = -EINVAL;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		goto put_names;

	/* RENAME_EXCHANGE is mutually exclusive with the other two flags */
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		goto put_names;

retry:
	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
				  &old_last, &old_type);
	if (error)
		goto put_names;

	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				  &new_type);
	if (error)
		goto exit1;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	/* source must be an ordinary component - not ".", ".." or "/" */
	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	/* same for the target; with NOREPLACE the error becomes -EEXIST */
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

retry_deleg:
	rd.old_parent	   = old_path.dentry;
	rd.mnt_idmap	   = mnt_idmap(old_path.mnt);
	rd.new_parent	   = new_path.dentry;
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;

	/* locks the parents and fills rd.old_dentry / rd.new_dentry */
	error = __start_renaming(&rd, lookup_flags, &old_last, &new_last);
	if (error)
		goto exit_lock_rename;

	if (flags & RENAME_EXCHANGE) {
		if (!d_is_dir(rd.new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit_unlock;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(rd.old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit_unlock;
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit_unlock;
	}

	error = security_path_rename(&old_path, rd.old_dentry,
				     &new_path, rd.new_dentry, flags);
	if (error)
		goto exit_unlock;

	error = vfs_rename(&rd);
exit_unlock:
	end_renaming(&rd);
exit_lock_rename:
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
exit1:
	path_put(&old_path);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
put_names:
	putname(from);
	putname(to);
	return error;
}
6082
/* renameat2(2): rename with RENAME_NOREPLACE/RENAME_EXCHANGE/RENAME_WHITEOUT */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				flags);
}
6089
/* renameat(2): renameat2() with no flags */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				0);
}
6096
/* rename(2): both paths relative to the current working directory, no flags */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
				getname(newname), 0);
}
6102
6103int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
6104{
6105 int copylen;
6106
6107 copylen = linklen;
6108 if (unlikely(copylen > (unsigned) buflen))
6109 copylen = buflen;
6110 if (copy_to_user(buffer, link, copylen))
6111 copylen = -EFAULT;
6112 return copylen;
6113}
6114
6115/**
6116 * vfs_readlink - copy symlink body into userspace buffer
6117 * @dentry: dentry on which to get symbolic link
6118 * @buffer: user memory pointer
6119 * @buflen: size of buffer
6120 *
6121 * Does not touch atime. That's up to the caller if necessary
6122 *
6123 * Does not call security hook.
6124 */
6125int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
6126{
6127 struct inode *inode = d_inode(dentry);
6128 DEFINE_DELAYED_CALL(done);
6129 const char *link;
6130 int res;
6131
6132 if (inode->i_opflags & IOP_CACHED_LINK)
6133 return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
6134
6135 if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
6136 if (unlikely(inode->i_op->readlink))
6137 return inode->i_op->readlink(dentry, buffer, buflen);
6138
6139 if (!d_is_symlink(dentry))
6140 return -EINVAL;
6141
6142 spin_lock(&inode->i_lock);
6143 inode->i_opflags |= IOP_DEFAULT_READLINK;
6144 spin_unlock(&inode->i_lock);
6145 }
6146
6147 link = READ_ONCE(inode->i_link);
6148 if (!link) {
6149 link = inode->i_op->get_link(dentry, inode, &done);
6150 if (IS_ERR(link))
6151 return PTR_ERR(link);
6152 }
6153 res = readlink_copy(buffer, buflen, link, strlen(link));
6154 do_delayed_call(&done);
6155 return res;
6156}
6157EXPORT_SYMBOL(vfs_readlink);
6158
6159/**
6160 * vfs_get_link - get symlink body
6161 * @dentry: dentry on which to get symbolic link
6162 * @done: caller needs to free returned data with this
6163 *
6164 * Calls security hook and i_op->get_link() on the supplied inode.
6165 *
6166 * It does not touch atime. That's up to the caller if necessary.
6167 *
6168 * Does not work on "special" symlinks like /proc/$$/fd/N
6169 */
6170const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
6171{
6172 const char *res = ERR_PTR(-EINVAL);
6173 struct inode *inode = d_inode(dentry);
6174
6175 if (d_is_symlink(dentry)) {
6176 res = ERR_PTR(security_inode_readlink(dentry));
6177 if (!res)
6178 res = inode->i_op->get_link(dentry, inode, done);
6179 }
6180 return res;
6181}
6182EXPORT_SYMBOL(vfs_get_link);
6183
/* get the link contents into pagecache */
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
			     struct delayed_call *callback)
{
	struct folio *folio;
	struct address_space *mapping = inode->i_mapping;

	if (!dentry) {
		/*
		 * No dentry: we may not block (presumably the RCU-walk
		 * case - TODO confirm).  Only use an already-cached,
		 * uptodate folio; otherwise -ECHILD makes the caller retry.
		 */
		folio = filemap_get_folio(mapping, 0);
		if (IS_ERR(folio))
			return ERR_PTR(-ECHILD);
		if (!folio_test_uptodate(folio)) {
			folio_put(folio);
			return ERR_PTR(-ECHILD);
		}
	} else {
		/* blocking read of folio 0, which holds the link body */
		folio = read_mapping_folio(mapping, 0, NULL);
		if (IS_ERR(folio))
			return ERR_CAST(folio);
	}
	/* the folio reference is dropped later via page_put_link() */
	set_delayed_call(callback, page_put_link, folio);
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	return folio_address(folio);
}
6208
/*
 * Like page_get_link(), but without forcing NUL-termination of the
 * link body (no nd_terminate_link() call).
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
			      struct delayed_call *callback)
{
	return __page_get_link(dentry, inode, callback);
}
6214EXPORT_SYMBOL_GPL(page_get_link_raw);
6215
6216/**
6217 * page_get_link() - An implementation of the get_link inode_operation.
6218 * @dentry: The directory entry which is the symlink.
6219 * @inode: The inode for the symlink.
6220 * @callback: Used to drop the reference to the symlink.
6221 *
6222 * Filesystems which store their symlinks in the page cache should use
6223 * this to implement the get_link() member of their inode_operations.
6224 *
6225 * Return: A pointer to the NUL-terminated symlink.
6226 */
6227const char *page_get_link(struct dentry *dentry, struct inode *inode,
6228 struct delayed_call *callback)
6229{
6230 char *kaddr = __page_get_link(dentry, inode, callback);
6231
6232 if (!IS_ERR(kaddr))
6233 nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
6234 return kaddr;
6235}
6236EXPORT_SYMBOL(page_get_link);
6237
6238/**
6239 * page_put_link() - Drop the reference to the symlink.
6240 * @arg: The folio which contains the symlink.
6241 *
6242 * This is used internally by page_get_link(). It is exported for use
6243 * by filesystems which need to implement a variant of page_get_link()
6244 * themselves. Despite the apparent symmetry, filesystems which use
6245 * page_get_link() do not need to call page_put_link().
6246 *
6247 * The argument, while it has a void pointer type, must be a pointer to
6248 * the folio which was retrieved from the page cache. The delayed_call
6249 * infrastructure is used to drop the reference count once the caller
6250 * is done with the symlink.
6251 */
6252void page_put_link(void *arg)
6253{
6254 folio_put(arg);
6255}
6256EXPORT_SYMBOL(page_put_link);
6257
6258int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
6259{
6260 const char *link;
6261 int res;
6262
6263 DEFINE_DELAYED_CALL(done);
6264 link = page_get_link(dentry, d_inode(dentry), &done);
6265 res = PTR_ERR(link);
6266 if (!IS_ERR(link))
6267 res = readlink_copy(buffer, buflen, link, strlen(link));
6268 do_delayed_call(&done);
6269 return res;
6270}
6271EXPORT_SYMBOL(page_readlink);
6272
/*
 * page_symlink - write a symlink body into the inode's pagecache
 * @inode: inode of the symlink
 * @symname: the link target string
 * @len: length of @symname including the trailing NUL, presumably
 *       strlen(symname) + 1 - only len - 1 bytes are stored
 *
 * Uses the mapping's ->write_begin()/->write_end() pair on page 0,
 * retrying from scratch on a short write.  Returns 0 on success or a
 * negative errno from the aops methods.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
	struct address_space *mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	/* If the mapping forbids __GFP_FS allocations, enter nofs scope
	 * around write_begin() so its allocations cannot recurse into
	 * the filesystem. */
	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
	struct folio *folio;
	void *fsdata = NULL;
	int err;
	unsigned int flags;

retry:
	if (nofs)
		flags = memalloc_nofs_save();
	err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
	if (nofs)
		memalloc_nofs_restore(flags);
	if (err)
		goto fail;

	/* Copy the link body (without the trailing NUL). */
	memcpy(folio_address(folio), symname, len - 1);

	err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
						folio, fsdata);
	if (err < 0)
		goto fail;
	/* Short write: start over rather than continue mid-page. */
	if (err < len-1)
		goto retry;

	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
6307
/*
 * Ready-made inode_operations for filesystems that keep their symlink
 * bodies in the page cache.
 */
const struct inode_operations page_symlink_inode_operations = {
	.get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);