Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * linux/fs/namei.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
8/*
9 * Some corrections by tytso.
10 */
11
12/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
13 * lookup logic.
14 */
15/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
16 */
17
18#include <linux/init.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/wordpart.h>
22#include <linux/fs.h>
23#include <linux/filelock.h>
24#include <linux/namei.h>
25#include <linux/pagemap.h>
26#include <linux/sched/mm.h>
27#include <linux/fsnotify.h>
28#include <linux/personality.h>
29#include <linux/security.h>
30#include <linux/syscalls.h>
31#include <linux/mount.h>
32#include <linux/audit.h>
33#include <linux/capability.h>
34#include <linux/file.h>
35#include <linux/fcntl.h>
36#include <linux/device_cgroup.h>
37#include <linux/fs_struct.h>
38#include <linux/posix_acl.h>
39#include <linux/hash.h>
40#include <linux/bitops.h>
41#include <linux/init_task.h>
42#include <linux/uaccess.h>
43
44#include "internal.h"
45#include "mount.h"
46
47/* [Feb-1997 T. Schoebel-Theuer]
48 * Fundamental changes in the pathname lookup mechanisms (namei)
49 * were necessary because of omirr. The reason is that omirr needs
50 * to know the _real_ pathname, not the user-supplied one, in case
51 * of symlinks (and also when transname replacements occur).
52 *
53 * The new code replaces the old recursive symlink resolution with
54 * an iterative one (in case of non-nested symlink chains). It does
55 * this with calls to <fs>_follow_link().
56 * As a side effect, dir_namei(), _namei() and follow_link() are now
57 * replaced with a single function lookup_dentry() that can handle all
58 * the special cases of the former code.
59 *
60 * With the new dcache, the pathname is stored at each inode, at least as
61 * long as the refcount of the inode is positive. As a side effect, the
62 * size of the dcache depends on the inode cache and thus is dynamic.
63 *
64 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
65 * resolution to correspond with current state of the code.
66 *
67 * Note that the symlink resolution is not *completely* iterative.
68 * There is still a significant amount of tail- and mid- recursion in
69 * the algorithm. Also, note that <fs>_readlink() is not used in
70 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
71 * may return different results than <fs>_follow_link(). Many virtual
72 * filesystems (including /proc) exhibit this behavior.
73 */
74
75/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
76 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
77 * and the name already exists in form of a symlink, try to create the new
78 * name indicated by the symlink. The old code always complained that the
79 * name already exists, due to not following the symlink even if its target
80 * is nonexistent. The new semantics affects also mknod() and link() when
81 * the name is a symlink pointing to a non-existent name.
82 *
83 * I don't know which semantics is the right one, since I have no access
84 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
85 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
86 * "old" one. Personally, I think the new semantics is much more logical.
87 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
89 * and in the old Linux semantics.
90 */
91
92/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
93 * semantics. See the comments in "open_namei" and "do_link" below.
94 *
95 * [10-Sep-98 Alan Modra] Another symlink change.
96 */
97
98/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
99 * inside the path - always follow.
100 * in the last component in creation/removal/renaming - never follow.
101 * if LOOKUP_FOLLOW passed - follow.
102 * if the pathname has trailing slashes - follow.
103 * otherwise - don't follow.
104 * (applied in that order).
105 *
106 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
107 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
108 * During the 2.4 we need to fix the userland stuff depending on it -
109 * hopefully we will be able to get rid of that wart in 2.5. So far only
110 * XEmacs seems to be relying on it...
111 */
112/*
113 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
115 * any extra contention...
116 */
117
118/* In order to reduce some races, while at the same time doing additional
119 * checking and hopefully speeding things up, we copy filenames to the
120 * kernel data space before using them..
121 *
122 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
123 * PATH_MAX includes the nul terminator --RR.
124 */
125
/* room for a name embedded in a __getname() allocation (see getname_flags()) */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
127
128static inline void initname(struct filename *name, const char __user *uptr)
129{
130 name->uptr = uptr;
131 name->aname = NULL;
132 atomic_set(&name->refcnt, 1);
133}
134
/**
 * getname_flags - copy a pathname from userspace
 * @filename: userspace pointer to the pathname
 * @flags: LOOKUP_* flags; only LOOKUP_EMPTY is examined here
 *
 * Copies the user-supplied name into kernel space, embedding it inside the
 * names_cache allocation when it fits.  Returns a referenced struct filename
 * or an ERR_PTR: -ENOMEM on allocation failure, -ENOENT for an empty path
 * without LOOKUP_EMPTY, -ENAMETOOLONG past PATH_MAX; negative results from
 * strncpy_from_user() are passed through.
 */
struct filename *
getname_flags(const char __user *filename, int flags)
{
	struct filename *result;
	char *kname;
	int len;

	/* audit may already hold a copy of this name; reuse it if so */
	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result->iname;
	result->name = kname;

	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
	/*
	 * Handle both empty path and copy failure in one go.
	 */
	if (unlikely(len <= 0)) {
		if (unlikely(len < 0)) {
			__putname(result);
			return ERR_PTR(len);
		}

		/* The empty path is special. */
		if (!(flags & LOOKUP_EMPTY)) {
			__putname(result);
			return ERR_PTR(-ENOENT);
		}
	}

	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
		const size_t size = offsetof(struct filename, iname[1]);
		kname = (char *)result;

		/*
		 * size is chosen so that result->iname[0] is within the
		 * same object and that kname can't be equal to
		 * result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
		}
		result->name = kname;
		/* re-copy, this time with the whole allocation available */
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		/* The empty path is special. */
		if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENOENT);
		}
		/* still no NUL within PATH_MAX: the name is too long */
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
	}
	initname(result, filename);
	audit_getname(result);
	return result;
}
217
218struct filename *getname_uflags(const char __user *filename, int uflags)
219{
220 int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
221
222 return getname_flags(filename, flags);
223}
224
225struct filename *__getname_maybe_null(const char __user *pathname)
226{
227 struct filename *name;
228 char c;
229
230 /* try to save on allocations; loss on um, though */
231 if (get_user(c, pathname))
232 return ERR_PTR(-EFAULT);
233 if (!c)
234 return NULL;
235
236 name = getname_flags(pathname, LOOKUP_EMPTY);
237 if (!IS_ERR(name) && !(name->name[0])) {
238 putname(name);
239 name = NULL;
240 }
241 return name;
242}
243
/**
 * getname_kernel - wrap an in-kernel pathname in a struct filename
 * @filename: NUL-terminated kernel-space string
 *
 * Counterpart of getname_flags() for names that already live in kernel
 * space, so no userspace copy is needed.  Returns ERR_PTR(-ENOMEM) or
 * ERR_PTR(-ENAMETOOLONG) on failure.
 */
struct filename *getname_kernel(const char * filename)
{
	struct filename *result;
	int len = strlen(filename) + 1;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	if (len <= EMBEDDED_NAME_MAX) {
		/* short name: embed the string in the same allocation */
		result->name = (char *)result->iname;
	} else if (len <= PATH_MAX) {
		/*
		 * Long name: dedicate the whole __getname() allocation to
		 * the string and kmalloc a separate struct for the header
		 * (mirrors the large-name layout in getname_flags()).
		 */
		const size_t size = offsetof(struct filename, iname[1]);
		struct filename *tmp;

		tmp = kmalloc(size, GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
	initname(result, NULL);
	audit_getname(result);
	return result;
}
EXPORT_SYMBOL(getname_kernel);
276
/**
 * putname - drop a reference to a struct filename
 * @name: name to release; ERR_PTR and NULL are silently ignored
 *
 * Frees the name when the last reference is dropped.  The common
 * single-reference case skips the atomic decrement entirely.
 */
void putname(struct filename *name)
{
	int refcnt;

	if (IS_ERR_OR_NULL(name))
		return;

	refcnt = atomic_read(&name->refcnt);
	if (unlikely(refcnt != 1)) {
		/* zero refcount here means a use-after-put; refuse to free */
		if (WARN_ON_ONCE(!refcnt))
			return;

		if (!atomic_dec_and_test(&name->refcnt))
			return;
	}

	/* struct and string allocated separately? (see getname_flags()) */
	if (unlikely(name->name != name->iname)) {
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
}
EXPORT_SYMBOL(putname);
300
/**
 * check_acl - perform ACL permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieve POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns -EAGAIN when no ACL verdict applies and the plain mode bits
 * should be used instead.
 */
static int check_acl(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
		/* rcu-walk: only the already-cached ACL may be consulted */
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
		if (!acl)
			return -EAGAIN;
		/* no ->get_inode_acl() calls in RCU mode... */
		if (is_uncached_acl(acl))
			return -ECHILD;	/* drop to ref-walk to fetch it */
		return posix_acl_permission(idmap, inode, acl, mask);
	}

	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(idmap, inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	return -EAGAIN;
}
345
/*
 * Very quick optimistic "we know we have no ACL's" check.
 *
 * Note that this is purely for ACL_TYPE_ACCESS, and purely
 * for the "we have cached that there are no ACLs" case.
 *
 * If this returns true, we know there are no ACLs. But if
 * it returns false, we might still not have ACLs (it could
 * be the is_uncached_acl() case).
 */
static inline bool no_acl_inode(struct inode *inode)
{
#ifdef CONFIG_FS_POSIX_ACL
	/* lockless peek: i_acl may be updated concurrently elsewhere */
	return likely(!READ_ONCE(inode->i_acl));
#else
	/* no ACL support compiled in: there can never be ACLs */
	return true;
#endif
}
364
/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 when access is granted, -EACCES (or an ACL error) otherwise.
 */
static int acl_permission_check(struct mnt_idmap *idmap,
				struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;
	vfsuid_t vfsuid;

	/*
	 * Common cheap case: everybody has the requested
	 * rights, and there are no ACLs to check. No need
	 * to do any owner/group checks in that case.
	 *
	 *  - 'mask&7' is the requested permission bit set
	 *  - multiplying by 0111 spreads them out to all of ugo
	 *  - '& ~mode' looks for missing inode permission bits
	 *  - the '!' is for "no missing permissions"
	 *
	 * After that, we just need to check that there are no
	 * ACL's on the inode - do the 'IS_POSIXACL()' check last
	 * because it will dereference the ->i_sb pointer and we
	 * want to avoid that if at all possible.
	 */
	if (!((mask & 7) * 0111 & ~mode)) {
		if (no_acl_inode(inode))
			return 0;
		if (!IS_POSIXACL(inode))
			return 0;
	}

	/* Are we the owner? If so, ACL's don't matter */
	vfsuid = i_uid_into_vfsuid(idmap, inode);
	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
		mask &= 7;
		mode >>= 6;	/* owner permission bits */
		return (mask & ~mode) ? -EACCES : 0;
	}

	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(idmap, inode, mask);
		/* -EAGAIN means "no ACL verdict; fall back to mode bits" */
		if (error != -EAGAIN)
			return error;
	}

	/* Only RWX matters for group/other mode bits */
	mask &= 7;

	/*
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
	 */
	if (mask & (mode ^ (mode >> 3))) {
		vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
		if (vfsgid_in_group_p(vfsgid))
			mode >>= 3;	/* group permission bits */
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
}
441
/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
		       int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(idmap, inode, mask);
	/* only -EACCES may still be overridden by capabilities below */
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
			if (capable_wrt_inode_uidgid(idmap, inode,
						     CAP_DAC_READ_SEARCH))
				return 0;
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;
		return -EACCES;
	}

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_READ_SEARCH))
			return 0;
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;

	return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
509
/**
 * do_inode_permission - UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
				      struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(idmap, inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(idmap, inode, mask);
}
535
536/**
537 * sb_permission - Check superblock-level permissions
538 * @sb: Superblock of inode to check permission on
539 * @inode: Inode to check permission on
540 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
541 *
542 * Separate out file-system wide checks from inode-specific permission checks.
543 *
544 * Note: lookup_inode_permission_may_exec() does not call here. If you add
545 * MAY_EXEC checks, adjust it.
546 */
547static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
548{
549 if (mask & MAY_WRITE) {
550 umode_t mode = inode->i_mode;
551
552 /* Nobody gets write access to a read-only fs. */
553 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
554 return -EROFS;
555 }
556 return 0;
557}
558
/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	Inode to check permission on
 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * Returns 0 if access is granted, a negative errno otherwise.
 */
int inode_permission(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
	int retval;

	/* filesystem-wide checks (e.g. read-only superblock) go first */
	retval = sb_permission(inode->i_sb, inode, mask);
	if (unlikely(retval))
		return retval;

	if (mask & MAY_WRITE) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (unlikely(IS_IMMUTABLE(inode)))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
			return -EACCES;
	}

	retval = do_inode_permission(idmap, inode, mask);
	if (unlikely(retval))
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (unlikely(retval))
		return retval;

	/* finally, give the LSMs a chance to veto */
	return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
607
/*
 * lookup_inode_permission_may_exec - Check traversal right for given inode
 *
 * This is a special case routine for may_lookup() making assumptions specific
 * to path traversal. Use inode_permission() if you are doing something else.
 *
 * Work is shaved off compared to inode_permission() as follows:
 * - we know for a fact there is no MAY_WRITE to worry about
 * - it is an invariant the inode is a directory
 *
 * Since majority of real-world traversal happens on inodes which grant it for
 * everyone, we check it upfront and only resort to more expensive work if it
 * fails.
 *
 * Filesystems which have their own ->permission hook and consequently miss out
 * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
 * on their directory inodes.
 */
static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
							    struct inode *inode, int mask)
{
	/* Lookup already checked this to return -ENOTDIR */
	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
	/* callers may only pass MAY_NOT_BLOCK; MAY_EXEC is added here */
	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);

	mask |= MAY_EXEC;

	/* no fast-path flag set: take the full permission path */
	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
		return inode_permission(idmap, inode, mask);

	/* fast path needs x granted to everyone and no cached ACLs */
	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
		return inode_permission(idmap, inode, mask);

	return security_inode_permission(inode, mask);
}
643
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
	/* take the mount ref first, mirroring the reverse order in path_put() */
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
656
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
	/* drop the dentry before the mount reference that keeps its fs pinned */
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
669
#define EMBEDDED_LEVELS 2
/*
 * State of a single pathname walk.  Threaded through current->nameidata
 * so that nested walks can find the enclosing one (see __set_nameidata()
 * and restore_nameidata()).
 */
struct nameidata {
	struct path path;		/* current position in the walk */
	struct qstr last;		/* last component name */
	struct path root;		/* root for this walk (may be unset) */
	struct inode *inode; /* path.dentry.d_inode */
	unsigned int flags, state;	/* LOOKUP_* flags / ND_* state bits */
	unsigned seq, next_seq, m_seq, r_seq;	/* rcu-walk seqcount samples */
	int last_type;			/* type of the last component */
	unsigned depth;			/* symlinks currently held on stack[] */
	int total_link_count;		/* symlink budget, shared across nesting */
	struct saved {
		struct path link;
		struct delayed_call done;
		const char *name;
		unsigned seq;
	} *stack, internal[EMBEDDED_LEVELS];	/* in-progress symlinks */
	struct filename *name;
	const char *pathname;		/* name->name, or "" when name is NULL */
	struct nameidata *saved;	/* enclosing walk, if nested */
	unsigned root_seq;		/* d_seq sample of root (rcu-walk) */
	int dfd;			/* starting dirfd for relative paths */
	vfsuid_t dir_vfsuid;
	umode_t dir_mode;
} __randomize_layout;
695
/* nd->state bits */
#define ND_ROOT_PRESET 1	/* nd->root was supplied by the caller */
#define ND_ROOT_GRABBED 2	/* we hold refs on nd->root; terminate_walk() puts them */
#define ND_JUMPED 4		/* walk jumped; complete_walk() may weak-revalidate */
699
/*
 * Install @p as the active pathwalk state for this task, chaining to any
 * walk already in progress.  Undone by restore_nameidata().
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;	/* start with the embedded symlink stack */
	p->depth = 0;
	p->dfd = dfd;
	p->name = name;
	p->pathname = likely(name) ? name->name : "";
	p->path.mnt = NULL;
	p->path.dentry = NULL;
	/* the symlink budget is shared with the enclosing walk, if any */
	p->total_link_count = old ? old->total_link_count : 0;
	p->saved = old;
	current->nameidata = p;
}
714
715static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
716 const struct path *root)
717{
718 __set_nameidata(p, dfd, name);
719 p->state = 0;
720 if (unlikely(root)) {
721 p->state = ND_ROOT_PRESET;
722 p->root = *root;
723 }
724}
725
/*
 * Pop the current pathwalk state: reinstate the enclosing walk (if any),
 * propagate the consumed symlink budget back to it, and free a
 * separately allocated symlink stack (see nd_alloc_stack()).
 */
static void restore_nameidata(void)
{
	struct nameidata *now = current->nameidata, *old = now->saved;

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
	if (now->stack != now->internal)
		kfree(now->stack);
}
736
737static bool nd_alloc_stack(struct nameidata *nd)
738{
739 struct saved *p;
740
741 p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
742 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
743 if (unlikely(!p))
744 return false;
745 memcpy(p, nd->internal, sizeof(nd->internal));
746 nd->stack = p;
747 return true;
748}
749
750/**
751 * path_connected - Verify that a dentry is below mnt.mnt_root
752 * @mnt: The mountpoint to check.
753 * @dentry: The dentry to check.
754 *
755 * Rename can sometimes move a file or directory outside of a bind
756 * mount, path_connected allows those cases to be detected.
757 */
758static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
759{
760 struct super_block *sb = mnt->mnt_sb;
761
762 /* Bind mounts can have disconnected paths */
763 if (mnt->mnt_root == sb->s_root)
764 return true;
765
766 return is_subdir(dentry, mnt->mnt_root);
767}
768
769static void drop_links(struct nameidata *nd)
770{
771 int i = nd->depth;
772 while (i--) {
773 struct saved *last = nd->stack + i;
774 do_delayed_call(&last->done);
775 clear_delayed_call(&last->done);
776 }
777}
778
/*
 * Leave rcu-walk mode: clear LOOKUP_RCU and the now-meaningless
 * sequence numbers, then drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}
785
/*
 * Tear down all walk state: run delayed symlink cleanups, then either
 * drop the references we hold (ref-walk) or simply leave RCU (rcu-walk,
 * where no references are held).
 */
static void terminate_walk(struct nameidata *nd)
{
	if (unlikely(nd->depth))
		drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		/* only put the root if this walk actually grabbed it */
		if (nd->state & ND_ROOT_GRABBED) {
			path_put(&nd->root);
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
}
806
/* path_put is needed afterwards regardless of success or failure */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
	int res = __legitimize_mnt(path->mnt, mseq);
	if (unlikely(res)) {
		/* res > 0: no mnt ref was taken; NULL it so path_put skips it */
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	/*
	 * Both refs are held now; the seqcount decides whether they name
	 * the same objects we saw under RCU.
	 */
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
823
/* Legitimize @path against the mount seqcount sampled at walk start. */
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	return __legitimize_path(path, seq, nd->m_seq);
}
829
/*
 * Acquire references on every symlink held on the stack.  On failure,
 * delayed cleanups run immediately via drop_links() and nd->depth is
 * trimmed to cover only the entries terminate_walk() still has to put.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	/* LOOKUP_CACHED walks must never leave RCU mode this way */
	VFS_BUG_ON(nd->flags & LOOKUP_CACHED);

	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}
846
static bool legitimize_root(struct nameidata *nd)
{
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
		return true;
	/*
	 * Flag the root as grabbed *before* trying: even a failed
	 * legitimization can leave references behind that
	 * terminate_walk() must drop (see __legitimize_path()).
	 */
	nd->state |= ND_ROOT_GRABBED;
	return legitimize_path(nd, &nd->root, nd->root_seq);
}
855
856/*
857 * Path walking has 2 modes, rcu-walk and ref-walk (see
858 * Documentation/filesystems/path-lookup.txt). In situations when we can't
859 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
860 * normal reference counts on dentries and vfsmounts to transition to ref-walk
861 * mode. Refcounts are grabbed at the last known good point before rcu-walk
862 * got stuck, so ref-walk may continue from there. If this is not successful
863 * (eg. a seqcount has changed), then failure is returned and it's up to caller
864 * to restart the path walk from the beginning in ref-walk mode.
865 */
866
/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* LOOKUP_CACHED means "RCU only": fail rather than take refs */
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		goto out1;
	}
	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

out1:
	/* no refs taken on nd->path; NULL it so terminate_walk() skips it */
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	/* partial refs (if any) were handled by the legitimize helpers */
	leave_rcu(nd);
	return false;
}
906
/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* LOOKUP_CACHED means "RCU only": fail rather than take refs */
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		goto out2;
	}
	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out2;
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			goto out2;	/* no mnt ref was taken */
		goto out1;		/* mnt ref held, dentry ref not */
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

	/*
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	leave_rcu(nd);
	return true;

out2:
	/* no mnt ref held either; NULL it so terminate_walk() skips it */
	nd->path.mnt = NULL;
out1:
	/* dentry ref not held; mnt (if non-NULL) is put by terminate_walk() */
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
out_dput:
	/* we do hold a ref on @dentry here; drop it ourselves */
	leave_rcu(nd);
	dput(dentry);
	return false;
}
972
973static inline int d_revalidate(struct inode *dir, const struct qstr *name,
974 struct dentry *dentry, unsigned int flags)
975{
976 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
977 return dentry->d_op->d_revalidate(dir, name, dentry, flags);
978 else
979 return 1;
980}
981
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (likely(!(nd->state & ND_ROOT_PRESET)))
			if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
				nd->root.mnt = NULL;
		/* the walk itself is done, so unlazy even for LOOKUP_CACHED */
		nd->flags &= ~LOOKUP_CACHED;
		if (!try_to_unlazy(nd))
			return -ECHILD;
	}

	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

	/* only a walk that jumped (ND_JUMPED) may need weak revalidation */
	if (likely(!(nd->state & ND_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
		return 0;

	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
	if (status > 0)
		return 0;

	/* 0 from ->d_weak_revalidate() means "invalid"; map it to -ESTALE */
	if (!status)
		status = -ESTALE;

	return status;
}
1046
/*
 * Sample current->fs->root into nd->root.  In RCU mode only the root and a
 * d_seq sample are recorded (no references taken); the fs->seq retry loop
 * guards against a concurrent root change.  In ref-walk mode a proper
 * reference is taken and ND_ROOT_GRABBED marks it for later release.
 */
static int set_root(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqbegin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqretry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
		nd->state |= ND_ROOT_GRABBED;
	}
	return 0;
}
1073
/*
 * Restart the walk from the root, as happens for an absolute symlink or a
 * leading '/'.  Refused with -EXDEV when the lookup must not escape its
 * starting point (LOOKUP_BENEATH, or LOOKUP_NO_XDEV once mid-walk).
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (unlikely(error))
			return error;
	}
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		/* verify the root didn't change since it was sampled */
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->state |= ND_JUMPED;
	return 0;
}
1105
1106/*
1107 * Helper to directly jump to a known parsed path from ->get_link,
1108 * caller must have taken a reference to path beforehand.
1109 */
1110int nd_jump_link(const struct path *path)
1111{
1112 int error = -ELOOP;
1113 struct nameidata *nd = current->nameidata;
1114
1115 if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1116 goto err;
1117
1118 error = -EXDEV;
1119 if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1120 if (nd->path.mnt != path->mnt)
1121 goto err;
1122 }
1123 /* Not currently safe for scoped-lookups. */
1124 if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1125 goto err;
1126
1127 path_put(&nd->path);
1128 nd->path = *path;
1129 nd->inode = nd->path.dentry->d_inode;
1130 nd->state |= ND_JUMPED;
1131 return 0;
1132
1133err:
1134 path_put(path);
1135 return error;
1136}
1137
/*
 * Pop the topmost symlink off nd->stack, running any delayed cleanup
 * registered by ->get_link().  In RCU mode no reference was taken on
 * the link's path, so there is nothing to put.
 */
static inline void put_link(struct nameidata *nd)
{
	struct saved *last = nd->stack + --nd->depth;
	do_delayed_call(&last->done);
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
}
1145
/*
 * Knobs behind the fs.protected_{symlinks,hardlinks,fifos,regular}
 * sysctls; consumed by may_follow_link(), may_linkat() and
 * may_create_in_sticky() below.
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
static const struct ctl_table namei_sysctls[] = {
	{
		.procname = "protected_symlinks",
		.data = &sysctl_protected_symlinks,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,	/* boolean */
	},
	{
		.procname = "protected_hardlinks",
		.data = &sysctl_protected_hardlinks,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,	/* boolean */
	},
	{
		.procname = "protected_fifos",
		.data = &sysctl_protected_fifos,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_TWO,	/* 2 = also group-writable dirs */
	},
	{
		.procname = "protected_regular",
		.data = &sysctl_protected_regular,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_TWO,	/* 2 = also group-writable dirs */
	},
};

static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */
1199
1200/**
1201 * may_follow_link - Check symlink following for unsafe situations
1202 * @nd: nameidata pathwalk data
1203 * @inode: Used for idmapping.
1204 *
1205 * In the case of the sysctl_protected_symlinks sysctl being enabled,
1206 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1207 * in a sticky world-writable directory. This is to protect privileged
1208 * processes from failing races against path names that may change out
1209 * from under them by way of other users creating malicious symlinks.
1210 * It will permit symlinks to be followed only when outside a sticky
1211 * world-writable directory, or when the uid of the symlink and follower
1212 * match, or when the directory owner matches the symlink's owner.
1213 *
1214 * Returns 0 if following the symlink is allowed, -ve on error.
1215 */
1216static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
1217{
1218 struct mnt_idmap *idmap;
1219 vfsuid_t vfsuid;
1220
1221 if (!sysctl_protected_symlinks)
1222 return 0;
1223
1224 idmap = mnt_idmap(nd->path.mnt);
1225 vfsuid = i_uid_into_vfsuid(idmap, inode);
1226 /* Allowed if owner and follower match. */
1227 if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1228 return 0;
1229
1230 /* Allowed if parent directory not sticky and world-writable. */
1231 if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
1232 return 0;
1233
1234 /* Allowed if parent directory and link owner match. */
1235 if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
1236 return 0;
1237
1238 if (nd->flags & LOOKUP_RCU)
1239 return -ECHILD;
1240
1241 audit_inode(nd->name, nd->stack[0].link.dentry, 0);
1242 audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
1243 return -EACCES;
1244}
1245
1246/**
1247 * safe_hardlink_source - Check for safe hardlink conditions
1248 * @idmap: idmap of the mount the inode was found from
1249 * @inode: the source inode to hardlink from
1250 *
1251 * Return false if at least one of the following conditions:
1252 * - inode is not a regular file
1253 * - inode is setuid
1254 * - inode is setgid and group-exec
1255 * - access failure for read and write
1256 *
1257 * Otherwise returns true.
1258 */
1259static bool safe_hardlink_source(struct mnt_idmap *idmap,
1260 struct inode *inode)
1261{
1262 umode_t mode = inode->i_mode;
1263
1264 /* Special files should not get pinned to the filesystem. */
1265 if (!S_ISREG(mode))
1266 return false;
1267
1268 /* Setuid files should not get pinned to the filesystem. */
1269 if (mode & S_ISUID)
1270 return false;
1271
1272 /* Executable setgid files should not get pinned to the filesystem. */
1273 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1274 return false;
1275
1276 /* Hardlinking to unreadable or unwritable sources is dangerous. */
1277 if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE))
1278 return false;
1279
1280 return true;
1281}
1282
1283/**
1284 * may_linkat - Check permissions for creating a hardlink
1285 * @idmap: idmap of the mount the inode was found from
1286 * @link: the source to hardlink from
1287 *
1288 * Block hardlink when all of:
1289 * - sysctl_protected_hardlinks enabled
1290 * - fsuid does not match inode
1291 * - hardlink source is unsafe (see safe_hardlink_source() above)
1292 * - not CAP_FOWNER in a namespace with the inode owner uid mapped
1293 *
1294 * If the inode has been found through an idmapped mount the idmap of
1295 * the vfsmount must be passed through @idmap. This function will then take
1296 * care to map the inode according to @idmap before checking permissions.
1297 * On non-idmapped mounts or if permission checking is to be performed on the
1298 * raw inode simply pass @nop_mnt_idmap.
1299 *
1300 * Returns 0 if successful, -ve on error.
1301 */
1302int may_linkat(struct mnt_idmap *idmap, const struct path *link)
1303{
1304 struct inode *inode = link->dentry->d_inode;
1305
1306 /* Inode writeback is not safe when the uid or gid are invalid. */
1307 if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
1308 !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
1309 return -EOVERFLOW;
1310
1311 if (!sysctl_protected_hardlinks)
1312 return 0;
1313
1314 /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1315 * otherwise, it must be a safe source.
1316 */
1317 if (safe_hardlink_source(idmap, inode) ||
1318 inode_owner_or_capable(idmap, inode))
1319 return 0;
1320
1321 audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
1322 return -EPERM;
1323}
1324
1325/**
1326 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1327 * should be allowed, or not, on files that already
1328 * exist.
1329 * @idmap: idmap of the mount the inode was found from
1330 * @nd: nameidata pathwalk data
1331 * @inode: the inode of the file to open
1332 *
1333 * Block an O_CREAT open of a FIFO (or a regular file) when:
1334 * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1335 * - the file already exists
1336 * - we are in a sticky directory
1337 * - we don't own the file
1338 * - the owner of the directory doesn't own the file
1339 * - the directory is world writable
1340 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1341 * the directory doesn't have to be world writable: being group writable will
1342 * be enough.
1343 *
1344 * If the inode has been found through an idmapped mount the idmap of
1345 * the vfsmount must be passed through @idmap. This function will then take
1346 * care to map the inode according to @idmap before checking permissions.
1347 * On non-idmapped mounts or if permission checking is to be performed on the
1348 * raw inode simply pass @nop_mnt_idmap.
1349 *
1350 * Returns 0 if the open is allowed, -ve on error.
1351 */
1352static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
1353 struct inode *const inode)
1354{
1355 umode_t dir_mode = nd->dir_mode;
1356 vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
1357
1358 if (likely(!(dir_mode & S_ISVTX)))
1359 return 0;
1360
1361 if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
1362 return 0;
1363
1364 if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
1365 return 0;
1366
1367 i_vfsuid = i_uid_into_vfsuid(idmap, inode);
1368
1369 if (vfsuid_eq(i_vfsuid, dir_vfsuid))
1370 return 0;
1371
1372 if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
1373 return 0;
1374
1375 if (likely(dir_mode & 0002)) {
1376 audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
1377 return -EACCES;
1378 }
1379
1380 if (dir_mode & 0020) {
1381 if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
1382 audit_log_path_denied(AUDIT_ANOM_CREAT,
1383 "sticky_create_fifo");
1384 return -EACCES;
1385 }
1386
1387 if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
1388 audit_log_path_denied(AUDIT_ANOM_CREAT,
1389 "sticky_create_regular");
1390 return -EACCES;
1391 }
1392 }
1393
1394 return 0;
1395}
1396
1397/*
1398 * follow_up - Find the mountpoint of path's vfsmount
1399 *
1400 * Given a path, find the mountpoint of its source file system.
1401 * Replace @path with the path of the mountpoint in the parent mount.
1402 * Up is towards /.
1403 *
1404 * Return 1 if we went up a level and 0 if we were already at the
1405 * root.
1406 */
1407int follow_up(struct path *path)
1408{
1409 struct mount *mnt = real_mount(path->mnt);
1410 struct mount *parent;
1411 struct dentry *mountpoint;
1412
1413 read_seqlock_excl(&mount_lock);
1414 parent = mnt->mnt_parent;
1415 if (parent == mnt) {
1416 read_sequnlock_excl(&mount_lock);
1417 return 0;
1418 }
1419 mntget(&parent->mnt);
1420 mountpoint = dget(mnt->mnt_mountpoint);
1421 read_sequnlock_excl(&mount_lock);
1422 dput(path->dentry);
1423 path->dentry = mountpoint;
1424 mntput(path->mnt);
1425 path->mnt = &parent->mnt;
1426 return 1;
1427}
1428EXPORT_SYMBOL(follow_up);
1429
/*
 * Lockless climb from mount @m towards @root, skipping parents where the
 * mountpoint is the parent's own root (i.e. mounts stacked over an entire
 * parent), until a real mountpoint in a parent is found.  Caller is
 * responsible for validating against mount_lock; *@seqp receives a d_seq
 * sample for the mountpoint dentry.  Returns false if @root (or the
 * absolute root) was reached first.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}
1449
/*
 * Reference-grabbing wrapper around choose_mountpoint_rcu(): retry the
 * lockless walk until either a stable "not found" answer is observed or
 * the found path is successfully legitimized (references acquired and
 * both seqcounts validated).
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* negative answer only counts if nothing changed */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* legitimize failed - drop refs and retry */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}
1474
1475/*
1476 * Perform an automount
1477 * - return -EISDIR to tell follow_managed() to stop and return the path we
1478 * were called with.
1479 */
1480static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
1481{
1482 struct dentry *dentry = path->dentry;
1483
1484 /* We don't want to mount if someone's just doing a stat -
1485 * unless they're stat'ing a directory and appended a '/' to
1486 * the name.
1487 *
1488 * We do, however, want to mount if someone wants to open or
1489 * create a file of any type under the mountpoint, wants to
1490 * traverse through the mountpoint or wants to open the
1491 * mounted directory. Also, autofs may mark negative dentries
1492 * as being automount points. These will need the attentions
1493 * of the daemon to instantiate them before they can be used.
1494 */
1495 if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1496 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1497 dentry->d_inode)
1498 return -EISDIR;
1499
1500 /* No need to trigger automounts if mountpoint crossing is disabled. */
1501 if (lookup_flags & LOOKUP_NO_XDEV)
1502 return -EXDEV;
1503
1504 if (count && (*count)++ >= MAXSYMLINKS)
1505 return -ELOOP;
1506
1507 return finish_automount(dentry->d_op->d_automount(path), path);
1508}
1509
1510/*
1511 * mount traversal - out-of-line part. One note on ->d_flags accesses -
1512 * dentries are pinned but not locked here, so negative dentry can go
1513 * positive right under us. Use of smp_load_acquire() provides a barrier
1514 * sufficient for ->d_inode and ->d_flags consistency.
1515 */
1516static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
1517 int *count, unsigned lookup_flags)
1518{
1519 struct vfsmount *mnt = path->mnt;
1520 bool need_mntput = false;
1521 int ret = 0;
1522
1523 while (flags & DCACHE_MANAGED_DENTRY) {
1524 /* Allow the filesystem to manage the transit without i_rwsem
1525 * being held. */
1526 if (flags & DCACHE_MANAGE_TRANSIT) {
1527 if (lookup_flags & LOOKUP_NO_XDEV) {
1528 ret = -EXDEV;
1529 break;
1530 }
1531 ret = path->dentry->d_op->d_manage(path, false);
1532 flags = smp_load_acquire(&path->dentry->d_flags);
1533 if (ret < 0)
1534 break;
1535 }
1536
1537 if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1538 struct vfsmount *mounted = lookup_mnt(path);
1539 if (mounted) { // ... in our namespace
1540 dput(path->dentry);
1541 if (need_mntput)
1542 mntput(path->mnt);
1543 path->mnt = mounted;
1544 path->dentry = dget(mounted->mnt_root);
1545 // here we know it's positive
1546 flags = path->dentry->d_flags;
1547 need_mntput = true;
1548 if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
1549 ret = -EXDEV;
1550 break;
1551 }
1552 continue;
1553 }
1554 }
1555
1556 if (!(flags & DCACHE_NEED_AUTOMOUNT))
1557 break;
1558
1559 // uncovered automount point
1560 ret = follow_automount(path, count, lookup_flags);
1561 flags = smp_load_acquire(&path->dentry->d_flags);
1562 if (ret < 0)
1563 break;
1564 }
1565
1566 if (ret == -EISDIR)
1567 ret = 0;
1568 // possible if you race with several mount --move
1569 if (need_mntput && path->mnt == mnt)
1570 mntput(path->mnt);
1571 if (!ret && unlikely(d_flags_negative(flags)))
1572 ret = -ENOENT;
1573 *jumped = need_mntput;
1574 return ret;
1575}
1576
1577static inline int traverse_mounts(struct path *path, bool *jumped,
1578 int *count, unsigned lookup_flags)
1579{
1580 unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1581
1582 /* fastpath */
1583 if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1584 *jumped = false;
1585 if (unlikely(d_flags_negative(flags)))
1586 return -ENOENT;
1587 return 0;
1588 }
1589 return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1590}
1591
1592int follow_down_one(struct path *path)
1593{
1594 struct vfsmount *mounted;
1595
1596 mounted = lookup_mnt(path);
1597 if (mounted) {
1598 dput(path->dentry);
1599 mntput(path->mnt);
1600 path->mnt = mounted;
1601 path->dentry = dget(mounted->mnt_root);
1602 return 1;
1603 }
1604 return 0;
1605}
1606EXPORT_SYMBOL(follow_down_one);
1607
1608/*
1609 * Follow down to the covering mount currently visible to userspace. At each
1610 * point, the filesystem owning that dentry may be queried as to whether the
1611 * caller is permitted to proceed or not.
1612 */
1613int follow_down(struct path *path, unsigned int flags)
1614{
1615 struct vfsmount *mnt = path->mnt;
1616 bool jumped;
1617 int ret = traverse_mounts(path, &jumped, NULL, flags);
1618
1619 if (path->mnt != mnt)
1620 mntput(mnt);
1621 return ret;
1622}
1623EXPORT_SYMBOL(follow_down);
1624
1625/*
1626 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1627 * we meet a managed dentry that would need blocking.
1628 */
1629static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
1630{
1631 struct dentry *dentry = path->dentry;
1632 unsigned int flags = dentry->d_flags;
1633
1634 if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1635 return true;
1636
1637 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1638 return false;
1639
1640 for (;;) {
1641 /*
1642 * Don't forget we might have a non-mountpoint managed dentry
1643 * that wants to block transit.
1644 */
1645 if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1646 int res = dentry->d_op->d_manage(path, true);
1647 if (res)
1648 return res == -EISDIR;
1649 flags = dentry->d_flags;
1650 }
1651
1652 if (flags & DCACHE_MOUNTED) {
1653 struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1654 if (mounted) {
1655 path->mnt = &mounted->mnt;
1656 dentry = path->dentry = mounted->mnt.mnt_root;
1657 nd->state |= ND_JUMPED;
1658 nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1659 flags = dentry->d_flags;
1660 // makes sure that non-RCU pathwalk could reach
1661 // this state.
1662 if (read_seqretry(&mount_lock, nd->m_seq))
1663 return false;
1664 continue;
1665 }
1666 if (read_seqretry(&mount_lock, nd->m_seq))
1667 return false;
1668 }
1669 return !(flags & DCACHE_NEED_AUTOMOUNT);
1670 }
1671}
1672
/*
 * Cross whatever is mounted on @dentry and report the result in *@path.
 * In rcu-walk mode the lockless __follow_mount_rcu() is tried first; if
 * it can't cope we drop to ref-walk (or fail with -ECHILD).  On error,
 * any references acquired for *@path are dropped before returning.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path)
{
	bool jumped;
	int ret;

	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = nd->next_seq;
		if (likely(!d_managed(dentry)))
			return 0;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (unlikely(!try_to_unlazy_next(nd, dentry)))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped)
		nd->state |= ND_JUMPED;
	if (unlikely(ret)) {
		/* drop whatever traverse_mounts() left us holding */
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	}
	return ret;
}
1704
1705/*
1706 * This looks up the name in dcache and possibly revalidates the found dentry.
1707 * NULL is returned if the dentry does not exist in the cache.
1708 */
1709static struct dentry *lookup_dcache(const struct qstr *name,
1710 struct dentry *dir,
1711 unsigned int flags)
1712{
1713 struct dentry *dentry = d_lookup(dir, name);
1714 if (dentry) {
1715 int error = d_revalidate(dir->d_inode, name, dentry, flags);
1716 if (unlikely(error <= 0)) {
1717 if (!error)
1718 d_invalidate(dentry);
1719 dput(dentry);
1720 return ERR_PTR(error);
1721 }
1722 }
1723 return dentry;
1724}
1725
1726/*
1727 * Parent directory has inode locked exclusive. This is one
1728 * and only case when ->lookup() gets called on non in-lookup
1729 * dentries - as the matter of fact, this only gets called
1730 * when directory is guaranteed to have no in-lookup children
1731 * at all.
1732 * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
1733 * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
1734 */
1735struct dentry *lookup_one_qstr_excl(const struct qstr *name,
1736 struct dentry *base, unsigned int flags)
1737{
1738 struct dentry *dentry;
1739 struct dentry *old;
1740 struct inode *dir;
1741
1742 dentry = lookup_dcache(name, base, flags);
1743 if (dentry)
1744 goto found;
1745
1746 /* Don't create child dentry for a dead directory. */
1747 dir = base->d_inode;
1748 if (unlikely(IS_DEADDIR(dir)))
1749 return ERR_PTR(-ENOENT);
1750
1751 dentry = d_alloc(base, name);
1752 if (unlikely(!dentry))
1753 return ERR_PTR(-ENOMEM);
1754
1755 old = dir->i_op->lookup(dir, dentry, flags);
1756 if (unlikely(old)) {
1757 dput(dentry);
1758 dentry = old;
1759 }
1760found:
1761 if (IS_ERR(dentry))
1762 return dentry;
1763 if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
1764 dput(dentry);
1765 return ERR_PTR(-ENOENT);
1766 }
1767 if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
1768 dput(dentry);
1769 return ERR_PTR(-EEXIST);
1770 }
1771 return dentry;
1772}
1773EXPORT_SYMBOL(lookup_one_qstr_excl);
1774
1775/**
1776 * lookup_fast - do fast lockless (but racy) lookup of a dentry
1777 * @nd: current nameidata
1778 *
1779 * Do a fast, but racy lookup in the dcache for the given dentry, and
1780 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
1781 * found. On error, an ERR_PTR will be returned.
1782 *
1783 * If this function returns a valid dentry and the walk is no longer
1784 * lazy, the dentry will carry a reference that must later be put. If
1785 * RCU mode is still in force, then this is not the case and the dentry
1786 * must be legitimized before use. If this returns NULL, then the walk
1787 * will no longer be in RCU mode.
1788 */
1789static struct dentry *lookup_fast(struct nameidata *nd)
1790{
1791 struct dentry *dentry, *parent = nd->path.dentry;
1792 int status = 1;
1793
1794 /*
1795 * Rename seqlock is not required here because in the off chance
1796 * of a false negative due to a concurrent rename, the caller is
1797 * going to fall back to non-racy lookup.
1798 */
1799 if (nd->flags & LOOKUP_RCU) {
1800 dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
1801 if (unlikely(!dentry)) {
1802 if (!try_to_unlazy(nd))
1803 return ERR_PTR(-ECHILD);
1804 return NULL;
1805 }
1806
1807 /*
1808 * This sequence count validates that the parent had no
1809 * changes while we did the lookup of the dentry above.
1810 */
1811 if (read_seqcount_retry(&parent->d_seq, nd->seq))
1812 return ERR_PTR(-ECHILD);
1813
1814 status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
1815 if (likely(status > 0))
1816 return dentry;
1817 if (!try_to_unlazy_next(nd, dentry))
1818 return ERR_PTR(-ECHILD);
1819 if (status == -ECHILD)
1820 /* we'd been told to redo it in non-rcu mode */
1821 status = d_revalidate(nd->inode, &nd->last,
1822 dentry, nd->flags);
1823 } else {
1824 dentry = __d_lookup(parent, &nd->last);
1825 if (unlikely(!dentry))
1826 return NULL;
1827 status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
1828 }
1829 if (unlikely(status <= 0)) {
1830 if (!status)
1831 d_invalidate(dentry);
1832 dput(dentry);
1833 return ERR_PTR(status);
1834 }
1835 return dentry;
1836}
1837
/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	/* get an in-lookup dentry, or an existing one somebody else made */
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* found a ready-made dentry - revalidate it instead */
		int error = d_revalidate(inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				/* invalid: evict it and retry from scratch */
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			/* the filesystem substituted its own dentry */
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}
1875
1876static noinline struct dentry *lookup_slow(const struct qstr *name,
1877 struct dentry *dir,
1878 unsigned int flags)
1879{
1880 struct inode *inode = dir->d_inode;
1881 struct dentry *res;
1882 inode_lock_shared(inode);
1883 res = __lookup_slow(name, dir, flags);
1884 inode_unlock_shared(inode);
1885 return res;
1886}
1887
1888static struct dentry *lookup_slow_killable(const struct qstr *name,
1889 struct dentry *dir,
1890 unsigned int flags)
1891{
1892 struct inode *inode = dir->d_inode;
1893 struct dentry *res;
1894
1895 if (inode_lock_shared_killable(inode))
1896 return ERR_PTR(-EINTR);
1897 res = __lookup_slow(name, dir, flags);
1898 inode_unlock_shared(inode);
1899 return res;
1900}
1901
/*
 * Check exec permission on the directory we are about to walk through.
 * In RCU mode the check is first tried with MAY_NOT_BLOCK; a failure
 * there may only mean "can't decide without blocking", so we drop out
 * of RCU mode and repeat the check before reporting an error.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
			     struct nameidata *restrict nd)
{
	int err, mask;

	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
	if (likely(!err))
		return 0;

	// If we failed, and we weren't in LOOKUP_RCU, it's final
	if (!(nd->flags & LOOKUP_RCU))
		return err;

	// Drop out of RCU mode to make sure it wasn't transient
	if (!try_to_unlazy(nd))
		return -ECHILD;	// redo it all non-lazy

	if (err != -ECHILD)	// hard error
		return err;

	/* retry without MAY_NOT_BLOCK now that blocking is allowed */
	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
}
1925
/*
 * Make sure nd->stack has room for one more symlink, enforcing the
 * MAXSYMLINKS limit.  Growing the stack can block, so in RCU mode the
 * link must be legitimized and the walk unlazied before the allocation
 * is retried.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;

	/* room left in the current stack? */
	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	/* already switched to an external (larger) stack? */
	if (likely(nd->stack != nd->internal))
		return 0;
	if (likely(nd_alloc_stack(nd)))
		return 0;

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy. And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
	}
	return -ENOMEM;
}
1951
/* flags for step_into() and friends; WALK_NOFOLLOW forbids following symlinks */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1953
/*
 * Push a freshly encountered symlink onto nd->stack and return its body
 * to be walked.  Returns the link body, NULL for a "pure jump" (nothing
 * left to walk), or ERR_PTR() on failure.
 */
static noinline const char *pick_link(struct nameidata *nd, struct path *link,
				     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error;

	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink from step_into_slowpath() matches the inode */
		if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		/* the stack entry needs its own mount reference */
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
	}

	error = reserve_stack(nd, link);
	if (unlikely(error)) {
		if (!(nd->flags & LOOKUP_RCU))
			path_put(link);
		return ERR_PTR(error);
	}
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
		return ERR_PTR(-ELOOP);

	if (unlikely(atime_needs_update(&last->link, inode))) {
		/* touch_atime() may block - leave RCU mode first */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		touch_atime(&last->link);
		cond_resched();
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	/* a cached ->i_link lets us skip calling ->get_link() */
	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
			/* -ECHILD from ->get_link(): retry in ref-walk mode */
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		/* absolute symlink: restart from root, skip extra slashes */
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
}
2035
2036/*
2037 * Do we need to follow links? We _really_ want to be able
2038 * to do this check without having to look at inode->i_op,
2039 * so we keep a cache of "no, this doesn't need follow_link"
2040 * for the common case.
2041 *
2042 * NOTE: dentry must be what nd->next_seq had been sampled from.
2043 */
2044static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
2045 struct dentry *dentry)
2046{
2047 struct path path;
2048 struct inode *inode;
2049 int err;
2050
2051 err = handle_mounts(nd, dentry, &path);
2052 if (unlikely(err < 0))
2053 return ERR_PTR(err);
2054 inode = path.dentry->d_inode;
2055 if (likely(!d_is_symlink(path.dentry)) ||
2056 ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
2057 (flags & WALK_NOFOLLOW)) {
2058 /* not a symlink or should not follow */
2059 if (nd->flags & LOOKUP_RCU) {
2060 if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
2061 return ERR_PTR(-ECHILD);
2062 if (unlikely(!inode))
2063 return ERR_PTR(-ENOENT);
2064 } else {
2065 dput(nd->path.dentry);
2066 if (nd->path.mnt != path.mnt)
2067 mntput(nd->path.mnt);
2068 }
2069 nd->path = path;
2070 nd->inode = inode;
2071 nd->seq = nd->next_seq;
2072 return NULL;
2073 }
2074 return pick_link(nd, &path, inode, flags);
2075}
2076
/*
 * Step nameidata onto @dentry.  The common rcu-walk case - an unmanaged,
 * non-symlink, positive dentry - is handled here inline without touching
 * any refcounts; everything else goes through step_into_slowpath().
 */
static __always_inline const char *step_into(struct nameidata *nd, int flags,
				     struct dentry *dentry)
{
	/*
	 * In the common case we are in rcu-walk and traversing over a non-mounted on
	 * directory (as opposed to e.g., a symlink).
	 *
	 * We can handle that and negative entries with the checks below.
	 */
	if (likely((nd->flags & LOOKUP_RCU) &&
		   !d_managed(dentry) && !d_is_symlink(dentry))) {
		struct inode *inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
		if (unlikely(!inode))
			return ERR_PTR(-ENOENT);
		nd->path.dentry = dentry;
		/* nd->path.mnt is retained on purpose */
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	return step_into_slowpath(nd, flags, dentry);
}
2101
/* RCU-mode ".." traversal; any failure returns -ECHILD to force ref-walk. */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;
		unsigned seq;
		/* on a mount root: ".." means going to the mountpoint's parent */
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		/* crossing mounts is forbidden; ref-walk will report -EXDEV */
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	/* ".." from a detached subtree must not escape; retry in ref-walk */
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	/* ".." in the root stays in the root */
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}
2141
/* Ref-walk counterpart of follow_dotdot_rcu(); returns a referenced parent. */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;

		/* on a mount root: step out to the mountpoint's parent mount */
		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
		path_put(&nd->path);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
	}
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	/* ".." must not escape from a disconnected subtree */
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	return parent;

in_root:
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	/* ".." in the root stays in the root */
	return dget(nd->path.dentry);
}
2173
/* Handle "." (a no-op) and ".." components; NULL on success, error otherwise. */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		/* ".." needs a root to compare against; pick it up lazily */
		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (unlikely(error))
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		/* ".." is never followed as a symlink */
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}
2211
/*
 * Walk one component (nd->last): NULL on success, a link body to follow,
 * or an ERR_PTR().
 */
static __always_inline const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (unlikely(nd->depth) && !(flags & WALK_MORE))
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	/* try the dcache first; NULL means we need a real lookup */
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	/* done with the symlink this component came from, unless WALK_MORE */
	if (unlikely(nd->depth) && !(flags & WALK_MORE))
		put_link(nd);
	return step_into(nd, flags, dentry);
}
2237
2238/*
2239 * We can do the critical dentry name comparison and hashing
2240 * operations one word at a time, but we are limited to:
2241 *
2242 * - Architectures with fast unaligned word accesses. We could
2243 * do a "get_unaligned()" if this helps and is sufficiently
2244 * fast.
2245 *
2246 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
2247 * do not trap on the (extremely unlikely) case of a page
2248 * crossing operation.
2249 *
2250 * - Furthermore, we need an efficient 64-bit compile for the
2251 * 64-bit case in order to generate the "number of bytes in
2252 * the final mask". Again, that could be replaced with a
2253 * efficient population count instruction or similar.
2254 */
2255#ifdef CONFIG_DCACHE_WORD_ACCESS
2256
2257#include <asm/word-at-a-time.h>
2258
2259#ifdef HASH_MIX
2260
2261/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2262
2263#elif defined(CONFIG_64BIT)
2264/*
2265 * Register pressure in the mixing function is an issue, particularly
2266 * on 32-bit x86, but almost any function requires one state value and
2267 * one temporary. Instead, use a function designed for two state values
2268 * and no temporaries.
2269 *
2270 * This function cannot create a collision in only two iterations, so
2271 * we have two iterations to achieve avalanche. In those two iterations,
2272 * we have six layers of mixing, which is enough to spread one bit's
2273 * influence out to 2^6 = 64 state bits.
2274 *
2275 * Rotate constants are scored by considering either 64 one-bit input
2276 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
2277 * probability of that delta causing a change to each of the 128 output
2278 * bits, using a sample of random initial states.
2279 *
2280 * The Shannon entropy of the computed probabilities is then summed
2281 * to produce a score. Ideally, any input change has a 50% chance of
2282 * toggling any given output bit.
2283 *
2284 * Mixing scores (in bits) for (12,45):
2285 * Input delta: 1-bit 2-bit
2286 * 1 round: 713.3 42542.6
2287 * 2 rounds: 2753.7 140389.8
2288 * 3 rounds: 5954.1 233458.2
2289 * 4 rounds: 7862.6 256672.2
2290 * Perfect: 8192 258048
2291 * (64*128) (64*63/2 * 128)
2292 */
/* One mixing round: inject word 'a', then cross-mix x and y (scores above). */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2298
2299/*
2300 * Fold two longs into one 32-bit hash value. This must be fast, but
2301 * latency isn't quite as critical, as there is a fair bit of additional
2302 * work done before the hash value is used.
2303 */
2304static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2305{
2306 y ^= x * GOLDEN_RATIO_64;
2307 y *= GOLDEN_RATIO_64;
2308 return y >> 32;
2309}
2310
2311#else /* 32-bit case */
2312
2313/*
2314 * Mixing scores (in bits) for (7,20):
2315 * Input delta: 1-bit 2-bit
2316 * 1 round: 330.3 9201.6
2317 * 2 rounds: 1246.4 25475.4
2318 * 3 rounds: 1907.1 31295.1
2319 * 4 rounds: 2042.3 31718.6
2320 * Perfect: 2048 31744
2321 * (32*64) (32*31/2 * 64)
2322 */
/* One mixing round, 32-bit variant: inject 'a', then cross-mix x and y. */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2328
/* Fold the two 32-bit state words into the final hash value. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
2334
2335#endif
2336
2337/*
 * Return the hash of a string of known length. This is carefully
2339 * designed to match hash_name(), which is the more critical function.
2340 * In particular, we must end by hashing a final word containing 0..7
2341 * payload bytes, to match the way that hash_name() iterates until it
2342 * finds the delimiter after the name.
2343 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		/* may read past the end of the string, but never faults */
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* final partial word: mask off the bytes beyond 'len' */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2363
2364/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/* jump into the middle of the loop: load first, mix on later passes */
	len = 0;
	goto inside;

	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	/* mask the final word down to the bytes before the NUL */
	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
2388
2389/*
2390 * Calculate the length and hash of the path component, and
2391 * return the length as the result.
2392 */
static inline const char *hash_name(struct nameidata *nd,
				    const char *name,
				    unsigned long *lastword)
{
	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
	unsigned long adata, bdata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/*
	 * The first iteration is special, because it can result in
	 * '.' and '..' and has no mixing other than the final fold.
	 */
	a = load_unaligned_zeropad(name);
	b = a ^ REPEAT_BYTE('/');	/* a zero byte in 'b' marks a '/' in 'a' */
	if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
		adata = prep_zero_mask(a, adata, &constants);
		bdata = prep_zero_mask(b, bdata, &constants);
		mask = create_zero_mask(adata | bdata);
		a &= zero_bytemask(mask);
		/* single-word component: caller can test it for "." / ".." */
		*lastword = a;
		len = find_zero(mask);
		nd->last.hash = fold_hash(a, y);
		nd->last.len = len;
		return name + len;
	}

	len = 0;
	x = 0;
	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
	a &= zero_bytemask(mask);
	x ^= a;
	len += find_zero(mask);
	*lastword = 0;		// Multi-word components cannot be DOT or DOTDOT

	nd->last.hash = fold_hash(x, y);
	nd->last.len = len;
	return name + len;
}
2440
2441/*
2442 * Note that the 'last' word is always zero-masked, but
2443 * was loaded as a possibly big-endian word.
2444 */
#ifdef __BIG_ENDIAN
  /* component bytes sit in the most significant end of a big-endian load */
  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
#endif
2449
2450#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2451
2452/* Return the hash of a string of known length */
/* Return the hash of a string of known length, one byte at a time. */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2461
2462/* Return the "hash_len" (hash and length) of a null-terminated string */
2463u64 hashlen_string(const void *salt, const char *name)
2464{
2465 unsigned long hash = init_name_hash(salt);
2466 unsigned long len = 0, c;
2467
2468 c = (unsigned char)*name;
2469 while (c) {
2470 len++;
2471 hash = partial_name_hash(c, hash);
2472 c = (unsigned char)name[len];
2473 }
2474 return hashlen_create(end_name_hash(hash), len);
2475}
2476EXPORT_SYMBOL(hashlen_string);
2477
2478/*
2479 * We know there's a real path component here of at least
2480 * one character.
2481 */
static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
	unsigned long hash = init_name_hash(nd->path.dentry);
	unsigned long len = 0, c, last = 0;

	c = (unsigned char)*name;
	do {
		/* pack the raw component bytes for the "." / ".." check below */
		last = (last << 8) + c;
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');

	// This is reliable for DOT or DOTDOT, since the component
	// cannot contain NUL characters - top bits being zero means
	// we cannot have had any other pathnames.
	*lastword = last;
	nd->last.hash = end_name_hash(hash);
	nd->last.len = len;
	return name + len;
}
2503
2504#endif
2505
#ifndef LAST_WORD_IS_DOT
  /* values matching the byte-at-a-time (and little-endian word) hash_name() */
  #define LAST_WORD_IS_DOT	0x2e
  #define LAST_WORD_IS_DOTDOT	0x2e2e
#endif
2510
2511/*
2512 * Name resolution.
2513 * This is the basic name resolution function, turning a pathname into
2514 * the final dentry. We expect 'base' to be positive and a directory.
2515 *
2516 * Returns 0 and nd will have valid dentry and mnt on success.
2517 * Returns error and drops reference to input namei data on failure.
2518 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	int depth = 0; // depth <= nd->depth
	int err;

	nd->last_type = LAST_ROOT;
	nd->flags |= LOOKUP_PARENT;
	if (IS_ERR(name))
		return PTR_ERR(name);
	/* skip leading slashes; path_init() already handled the jump to root */
	if (*name == '/') {
		do {
			name++;
		} while (unlikely(*name == '/'));
	}
	if (unlikely(!*name)) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
		return 0;
	}

	/* At this point we know we have a real path component. */
	for(;;) {
		struct mnt_idmap *idmap;
		const char *link;
		unsigned long lastword;

		idmap = mnt_idmap(nd->path.mnt);
		err = may_lookup(idmap, nd);
		if (unlikely(err))
			return err;

		nd->last.name = name;
		name = hash_name(nd, name, &lastword);

		/* lastword identifies "." and ".." exactly; see hash_name() */
		switch(lastword) {
		case LAST_WORD_IS_DOTDOT:
			nd->last_type = LAST_DOTDOT;
			nd->state |= ND_JUMPED;
			break;

		case LAST_WORD_IS_DOT:
			nd->last_type = LAST_DOT;
			break;

		default:
			nd->last_type = LAST_NORM;
			nd->state &= ~ND_JUMPED;

			/* let the filesystem override the hash if it wants to */
			struct dentry *parent = nd->path.dentry;
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, &nd->last);
				if (err < 0)
					return err;
			}
		}

		if (!*name)
			goto OK;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			name++;
		} while (unlikely(*name == '/'));
		if (unlikely(!*name)) {
OK:
			/* pathname or trailing symlink, done */
			if (likely(!depth)) {
				nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
				nd->dir_mode = nd->inode->i_mode;
				nd->flags &= ~LOOKUP_PARENT;
				return 0;
			}
			/* last component of nested symlink */
			name = nd->stack[--depth].name;
			link = walk_component(nd, 0);
		} else {
			/* not the last component */
			link = walk_component(nd, WALK_MORE);
		}
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
			nd->stack[depth++].name = name;
			name = link;
			continue;
		}
		/* more components follow, so we must be able to look them up */
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (!try_to_unlazy(nd))
					return -ECHILD;
			}
			return -ENOTDIR;
		}
	}
}
2616
2617/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
	int error;
	const char *s = nd->pathname;

	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED))
		return ERR_PTR(-EAGAIN);

	/* empty pathname: no walk will happen, so RCU mode buys nothing */
	if (unlikely(!*s))
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;

	/* sample mount and rename seqcounts for later consistency checks */
	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

	if (unlikely(nd->state & ND_ROOT_PRESET)) {
		/* the caller supplied nd->root; start the walk there */
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->path);
		}
		return s;
	}

	nd->root.mnt = NULL;

	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			/* snapshot pwd consistently without taking references */
			do {
				seq = read_seqbegin(&fs->seq);
				nd->path = fs->pwd;
				nd->inode = nd->path.dentry->d_inode;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqretry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		CLASS(fd_raw, f)(nd->dfd);
		struct dentry *dentry;

		if (fd_empty(f))
			return ERR_PTR(-EBADF);

		if (flags & LOOKUP_LINKAT_EMPTY) {
			if (fd_file(f)->f_cred != current_cred() &&
			    !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
				return ERR_PTR(-ENOENT);
		}

		dentry = fd_file(f)->f_path.dentry;

		/* only an empty ("") path may start at a non-directory */
		if (*s && unlikely(!d_can_lookup(dentry)))
			return ERR_PTR(-ENOTDIR);

		nd->path = fd_file(f)->f_path;
		if (flags & LOOKUP_RCU) {
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	}

	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (unlikely(flags & LOOKUP_IS_SCOPED)) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->state |= ND_ROOT_GRABBED;
		}
	}
	return s;
}
2724
static inline const char *lookup_last(struct nameidata *nd)
{
	/* a trailing '/' forces following symlinks and requires a directory */
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	return walk_component(nd, WALK_TRAILING);
}
2732
/* Step onto whatever is mounted on the current position. */
static int handle_lookup_down(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU))
		dget(nd->path.dentry);
	nd->next_seq = nd->seq;
	/* step_into() returns NULL on success, which PTR_ERR() maps to 0 */
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}
2740
2741/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	/* LOOKUP_DOWN: descend through mounts on the starting point first */
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
		err = handle_lookup_down(nd);
		if (unlikely(err < 0))
			s = ERR_PTR(err);
	}

	/* keep walking until no trailing symlinks remain */
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
	}
	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
			err = -ENOTDIR;
	if (!err) {
		/* transfer the references to the caller */
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2774
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, const struct path *root)
{
	int retval;
	struct nameidata nd;
	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	/* RCU first; fall back to ref-walk, then to forced revalidation */
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(&nd, flags, path);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

	if (likely(!retval))
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
	restore_nameidata();
	return retval;
}
2795
2796/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
			 struct path *parent)
{
	const char *s = path_init(nd, flags);
	int err = link_path_walk(s, nd);
	if (!err)
		err = complete_walk(nd);
	if (!err) {
		/* hand the references over to the caller */
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2812
2813/* Note: this does not consume "name" */
/* Note: this does not consume "name" */
static int __filename_parentat(int dfd, struct filename *name,
			       unsigned int flags, struct path *parent,
			       struct qstr *last, int *type,
			       const struct path *root)
{
	int retval;
	struct nameidata nd;

	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	/* RCU first; fall back to ref-walk, then to forced revalidation */
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
	if (unlikely(retval == -ECHILD))
		retval = path_parentat(&nd, flags, parent);
	if (unlikely(retval == -ESTALE))
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
	if (likely(!retval)) {
		/* report the final component and its classification */
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
	}
	restore_nameidata();
	return retval;
}
2838
/* as __filename_parentat(), with no caller-supplied root */
static int filename_parentat(int dfd, struct filename *name,
			     unsigned int flags, struct path *parent,
			     struct qstr *last, int *type)
{
	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
2845
2846/**
2847 * __start_dirop - begin a create or remove dirop, performing locking and lookup
2848 * @parent: the dentry of the parent in which the operation will occur
2849 * @name: a qstr holding the name within that parent
2850 * @lookup_flags: intent and other lookup flags.
2851 * @state: task state bitmask
2852 *
2853 * The lookup is performed and necessary locks are taken so that, on success,
2854 * the returned dentry can be operated on safely.
2855 * The qstr must already have the hash value calculated.
2856 *
2857 * Returns: a locked dentry, or an error.
2858 *
2859 */
static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
				    unsigned int lookup_flags,
				    unsigned int state)
{
	struct dentry *dentry;
	struct inode *dir = d_inode(parent);

	/* TASK_KILLABLE callers may be interrupted while waiting for the lock */
	if (state == TASK_KILLABLE) {
		int ret = down_write_killable_nested(&dir->i_rwsem,
						     I_MUTEX_PARENT);
		if (ret)
			return ERR_PTR(ret);
	} else {
		inode_lock_nested(dir, I_MUTEX_PARENT);
	}
	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
	/* on failure the caller never sees the lock, so drop it here */
	if (IS_ERR(dentry))
		inode_unlock(dir);
	return dentry;
}
2880
/* as __start_dirop(), but waiting for the lock is not killable */
struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
			   unsigned int lookup_flags)
{
	return __start_dirop(parent, name, lookup_flags, TASK_NORMAL);
}
2886
2887/**
2888 * end_dirop - signal completion of a dirop
2889 * @de: the dentry which was returned by start_dirop or similar.
2890 *
 * If @de is an error, nothing happens. Otherwise any lock taken to
 * protect the dentry is dropped and the dentry itself is released (dput()).
2893 */
void end_dirop(struct dentry *de)
{
	if (!IS_ERR(de)) {
		/* drop the parent lock taken by start_dirop(), then the ref */
		inode_unlock(de->d_parent->d_inode);
		dput(de);
	}
}
EXPORT_SYMBOL(end_dirop);
2902
2903/* does lookup, returns the object with parent locked */
2904static struct dentry *__start_removing_path(int dfd, struct filename *name,
2905 struct path *path)
2906{
2907 struct path parent_path __free(path_put) = {};
2908 struct dentry *d;
2909 struct qstr last;
2910 int type, error;
2911
2912 error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
2913 if (error)
2914 return ERR_PTR(error);
2915 if (unlikely(type != LAST_NORM))
2916 return ERR_PTR(-EINVAL);
2917 /* don't fail immediately if it's r/o, at least try to report other errors */
2918 error = mnt_want_write(parent_path.mnt);
2919 d = start_dirop(parent_path.dentry, &last, 0);
2920 if (IS_ERR(d))
2921 goto drop;
2922 if (error)
2923 goto fail;
2924 path->dentry = no_free_ptr(parent_path.dentry);
2925 path->mnt = no_free_ptr(parent_path.mnt);
2926 return d;
2927
2928fail:
2929 end_dirop(d);
2930 d = ERR_PTR(error);
2931drop:
2932 if (!error)
2933 mnt_drop_write(parent_path.mnt);
2934 return d;
2935}
2936
2937/**
2938 * kern_path_parent: lookup path returning parent and target
2939 * @name: path name
 * @path: path to store the parent in
 *
 * The path @name should end with a normal component, not "." or ".." or "/".
 * A lookup is performed and if successful the parent information
 * is stored in @path and the dentry is returned.
 *
 * The dentry may be negative, the parent will be positive.
2947 *
2948 * Returns: dentry or error.
2949 */
struct dentry *kern_path_parent(const char *name, struct path *path)
{
	struct path parent_path __free(path_put) = {};
	struct filename *filename __free(putname) = getname_kernel(name);
	struct dentry *d;
	struct qstr last;
	int type, error;

	error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type);
	if (error)
		return ERR_PTR(error);
	/* the final component must be a normal name, not "." / ".." / "/" */
	if (unlikely(type != LAST_NORM))
		return ERR_PTR(-EINVAL);

	d = lookup_noperm_unlocked(&last, parent_path.dentry);
	if (IS_ERR(d))
		return d;
	/* success: hand the parent references over to the caller */
	path->dentry = no_free_ptr(parent_path.dentry);
	path->mnt = no_free_ptr(parent_path.mnt);
	return d;
}
2971
/* as __start_removing_path() for a kernel-space path relative to cwd */
struct dentry *start_removing_path(const char *name, struct path *path)
{
	struct filename *filename = getname_kernel(name);
	struct dentry *res = __start_removing_path(AT_FDCWD, filename, path);

	putname(filename);
	return res;
}
2980
/* as __start_removing_path() for a user-space path relative to @dfd */
struct dentry *start_removing_user_path_at(int dfd,
					   const char __user *name,
					   struct path *path)
{
	struct filename *filename = getname(name);
	struct dentry *res = __start_removing_path(dfd, filename, path);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(start_removing_user_path_at);
2992
2993int kern_path(const char *name, unsigned int flags, struct path *path)
2994{
2995 struct filename *filename = getname_kernel(name);
2996 int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
2997
2998 putname(filename);
2999 return ret;
3000
3001}
3002EXPORT_SYMBOL(kern_path);
3003
3004/**
3005 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
3006 * @filename: filename structure
3007 * @flags: lookup flags
3008 * @parent: pointer to struct path to fill
3009 * @last: last component
3010 * @type: type of the last component
3011 * @root: pointer to struct path of the base directory
3012 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
			   struct path *parent, struct qstr *last, int *type,
			   const struct path *root)
{
	/* the dfd argument is ignored when a root is supplied */
	return __filename_parentat(AT_FDCWD, filename, flags, parent, last,
				   type, root);
}
EXPORT_SYMBOL(vfs_path_parent_lookup);
3021
3022/**
3023 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
3024 * @dentry: pointer to dentry of the base directory
3025 * @mnt: pointer to vfs mount of the base directory
3026 * @name: pointer to file name
3027 * @flags: lookup flags
3028 * @path: pointer to struct path to fill
3029 */
3030int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
3031 const char *name, unsigned int flags,
3032 struct path *path)
3033{
3034 struct filename *filename;
3035 struct path root = {.mnt = mnt, .dentry = dentry};
3036 int ret;
3037
3038 filename = getname_kernel(name);
3039 /* the first argument of filename_lookup() is ignored with root */
3040 ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
3041 putname(filename);
3042 return ret;
3043}
3044EXPORT_SYMBOL(vfs_path_lookup);
3045
3046int lookup_noperm_common(struct qstr *qname, struct dentry *base)
3047{
3048 const char *name = qname->name;
3049 u32 len = qname->len;
3050
3051 qname->hash = full_name_hash(base, name, len);
3052 if (!len)
3053 return -EACCES;
3054
3055 if (is_dot_dotdot(name, len))
3056 return -EACCES;
3057
3058 while (len--) {
3059 unsigned int c = *(const unsigned char *)name++;
3060 if (c == '/' || c == '\0')
3061 return -EACCES;
3062 }
3063 /*
3064 * See if the low-level filesystem might want
3065 * to use its own hash..
3066 */
3067 if (base->d_flags & DCACHE_OP_HASH) {
3068 int err = base->d_op->d_hash(base, qname);
3069 if (err < 0)
3070 return err;
3071 }
3072 return 0;
3073}
3074
3075static int lookup_one_common(struct mnt_idmap *idmap,
3076 struct qstr *qname, struct dentry *base)
3077{
3078 int err;
3079 err = lookup_noperm_common(qname, base);
3080 if (err < 0)
3081 return err;
3082 return inode_permission(idmap, base->d_inode, MAY_EXEC);
3083}
3084
3085/**
3086 * try_lookup_noperm - filesystem helper to lookup single pathname component
3087 * @name: qstr storing pathname component to lookup
3088 * @base: base directory to lookup from
3089 *
3090 * Look up a dentry by name in the dcache, returning NULL if it does not
3091 * currently exist. The function does not try to create a dentry and if one
3092 * is found it doesn't try to revalidate it.
3093 *
3094 * Note that this routine is purely a helper for filesystem usage and should
3095 * not be called by generic code. It does no permission checking.
3096 *
3097 * No locks need be held - only a counted reference to @base is needed.
3098 *
3099 */
/* See the kerneldoc above: dcache-only lookup, no permission checks. */
struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
	int err = lookup_noperm_common(name, base);

	return err ? ERR_PTR(err) : d_lookup(base, name);
}
EXPORT_SYMBOL(try_lookup_noperm);
3111
3112/**
3113 * lookup_noperm - filesystem helper to lookup single pathname component
3114 * @name: qstr storing pathname component to lookup
3115 * @base: base directory to lookup from
3116 *
3117 * Note that this routine is purely a helper for filesystem usage and should
3118 * not be called by generic code. It does no permission checking.
3119 *
3120 * The caller must hold base->i_rwsem.
3121 */
3122struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
3123{
3124 struct dentry *dentry;
3125 int err;
3126
3127 WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3128
3129 err = lookup_noperm_common(name, base);
3130 if (err)
3131 return ERR_PTR(err);
3132
3133 dentry = lookup_dcache(name, base, 0);
3134 return dentry ? dentry : __lookup_slow(name, base, 0);
3135}
3136EXPORT_SYMBOL(lookup_noperm);
3137
3138/**
3139 * lookup_one - lookup single pathname component
3140 * @idmap: idmap of the mount the lookup is performed from
3141 * @name: qstr holding pathname component to lookup
3142 * @base: base directory to lookup from
3143 *
3144 * This can be used for in-kernel filesystem clients such as file servers.
3145 *
3146 * The caller must hold base->i_rwsem.
3147 */
3148struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
3149 struct dentry *base)
3150{
3151 struct dentry *dentry;
3152 int err;
3153
3154 WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3155
3156 err = lookup_one_common(idmap, name, base);
3157 if (err)
3158 return ERR_PTR(err);
3159
3160 dentry = lookup_dcache(name, base, 0);
3161 return dentry ? dentry : __lookup_slow(name, base, 0);
3162}
3163EXPORT_SYMBOL(lookup_one);
3164
/**
 * lookup_one_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * Unlike lookup_one, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
				   struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
3192
/**
 * lookup_one_positive_killable - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * It should be called without the parent i_rwsem held, and will take
 * the i_rwsem itself if necessary. If a fatal signal is pending or
 * delivered, it will return %-EINTR if the lock is needed.
 */
struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
					    struct qstr *name,
					    struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow_killable(name, base, 0);
	/* d_flags read with acquire semantics pairs with dentry instantiation */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_killable);
3233
/**
 * lookup_one_positive_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * The helper should be called without i_rwsem held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
					    struct qstr *name,
					    struct dentry *base)
{
	struct dentry *ret = lookup_one_unlocked(idmap, name, base);

	/* d_flags read with acquire semantics pairs with dentry instantiation */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);
3264
/**
 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
 * @name: pathname component to lookup
 * @base: base directory to lookup from
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code. It does no permission checking.
 *
 * Unlike lookup_noperm(), it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
 * existed.
 */
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
	struct dentry *dentry;
	int error;

	error = lookup_noperm_common(name, base);
	if (error)
		return ERR_PTR(error);

	dentry = lookup_dcache(name, base, 0);
	return dentry ? dentry : lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_noperm_unlocked);
3294
3295/*
3296 * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
3297 * on negatives. Returns known positive or ERR_PTR(); that's what
3298 * most of the users want. Note that pinned negative with unlocked parent
3299 * _can_ become positive at any time, so callers of lookup_noperm_unlocked()
3300 * need to be very careful; pinned positives have ->d_inode stable, so
3301 * this one avoids such problems.
3302 */
3303struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
3304 struct dentry *base)
3305{
3306 struct dentry *ret;
3307
3308 ret = lookup_noperm_unlocked(name, base);
3309 if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3310 dput(ret);
3311 ret = ERR_PTR(-ENOENT);
3312 }
3313 return ret;
3314}
3315EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
3316
3317/**
3318 * start_creating - prepare to create a given name with permission checking
3319 * @idmap: idmap of the mount
3320 * @parent: directory in which to prepare to create the name
3321 * @name: the name to be created
3322 *
3323 * Locks are taken and a lookup is performed prior to creating
3324 * an object in a directory. Permission checking (MAY_EXEC) is performed
3325 * against @idmap.
3326 *
3327 * If the name already exists, a positive dentry is returned, so
3328 * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
3329 * with -EEXIST.
3330 *
3331 * Returns: a negative or positive dentry, or an error.
3332 */
3333struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
3334 struct qstr *name)
3335{
3336 int err = lookup_one_common(idmap, name, parent);
3337
3338 if (err)
3339 return ERR_PTR(err);
3340 return start_dirop(parent, name, LOOKUP_CREATE);
3341}
3342EXPORT_SYMBOL(start_creating);
3343
/**
 * start_removing - prepare to remove a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
			      struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing);
3370
/**
 * start_creating_killable - prepare to create a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to prepare to create the name
 * @name: the name to be created
 *
 * Locks are taken and a lookup is performed prior to creating
 * an object in a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name already exists, a positive dentry is returned.
 *
 * If a fatal signal is received or was already pending, the function aborts
 * with -EINTR.
 *
 * Returns: a negative or positive dentry, or an error.
 */
struct dentry *start_creating_killable(struct mnt_idmap *idmap,
				       struct dentry *parent,
				       struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
}
EXPORT_SYMBOL(start_creating_killable);
3399
/**
 * start_removing_killable - prepare to remove a given name with permission checking
 * @idmap: idmap of the mount
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory. Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * If a fatal signal is received or was already pending, the function aborts
 * with -EINTR.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing_killable(struct mnt_idmap *idmap,
				       struct dentry *parent,
				       struct qstr *name)
{
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return __start_dirop(parent, name, 0, TASK_KILLABLE);
}
EXPORT_SYMBOL(start_removing_killable);
3430
/**
 * start_creating_noperm - prepare to create a given name without permission checking
 * @parent: directory in which to prepare to create the name
 * @name: the name to be created
 *
 * Locks are taken and a lookup is performed prior to creating
 * an object in a directory.
 *
 * If the name already exists, a positive dentry is returned.
 *
 * Returns: a negative or positive dentry, or an error.
 */
struct dentry *start_creating_noperm(struct dentry *parent,
				     struct qstr *name)
{
	int err = lookup_noperm_common(name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, LOOKUP_CREATE);
}
EXPORT_SYMBOL(start_creating_noperm);
3453
/**
 * start_removing_noperm - prepare to remove a given name without permission checking
 * @parent: directory in which to find the name
 * @name: the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing_noperm(struct dentry *parent,
				     struct qstr *name)
{
	int err = lookup_noperm_common(name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing_noperm);
3478
/**
 * start_creating_dentry - prepare to create a given dentry
 * @parent: directory in which the dentry should be created
 * @child: the dentry to be created
 *
 * A lock is taken to protect the dentry against other dirops and
 * the validity of the dentry is checked: correct parent and still hashed.
 *
 * If the dentry is valid and negative a reference is taken and
 * returned. If not an error is returned.
 *
 * end_creating() should be called when creation is complete, or aborted.
 *
 * Returns: the valid dentry, or an error.
 */
struct dentry *start_creating_dentry(struct dentry *parent,
				     struct dentry *child)
{
	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
	if (unlikely(IS_DEADDIR(parent->d_inode) ||
		     child->d_parent != parent ||
		     d_unhashed(child))) {
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EINVAL);
	}
	if (d_is_positive(child)) {
		/* already exists - creation cannot proceed */
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EEXIST);
	}
	return dget(child);
}
EXPORT_SYMBOL(start_creating_dentry);
3511
/**
 * start_removing_dentry - prepare to remove a given dentry
 * @parent: directory from which dentry should be removed
 * @child: the dentry to be removed
 *
 * A lock is taken to protect the dentry against other dirops and
 * the validity of the dentry is checked: correct parent and still hashed.
 *
 * If the dentry is valid and positive, a reference is taken and
 * returned. If not an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: the valid dentry, or an error.
 */
struct dentry *start_removing_dentry(struct dentry *parent,
				     struct dentry *child)
{
	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
	if (unlikely(IS_DEADDIR(parent->d_inode) ||
		     child->d_parent != parent ||
		     d_unhashed(child))) {
		inode_unlock(parent->d_inode);
		return ERR_PTR(-EINVAL);
	}
	if (d_is_negative(child)) {
		/* nothing to remove */
		inode_unlock(parent->d_inode);
		return ERR_PTR(-ENOENT);
	}
	return dget(child);
}
EXPORT_SYMBOL(start_removing_dentry);
3544
3545#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/*
	 * Transfer the reference from dget_parent() into the path; on the
	 * failure return below the caller's path_put() will release it.
	 */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* child carries its own reference; drop the one now displaced */
	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
3570#endif
3571
3572int user_path_at(int dfd, const char __user *name, unsigned flags,
3573 struct path *path)
3574{
3575 struct filename *filename = getname_flags(name, flags);
3576 int ret = filename_lookup(dfd, filename, flags, path, NULL);
3577
3578 putname(filename);
3579 return ret;
3580}
3581EXPORT_SYMBOL(user_path_at);
3582
3583int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
3584 struct inode *inode)
3585{
3586 kuid_t fsuid = current_fsuid();
3587
3588 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid))
3589 return 0;
3590 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
3591 return 0;
3592 return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
3593}
3594EXPORT_SYMBOL(__check_sticky);
3595
/*
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 * 1. We can't do it if dir is read-only (done in permission())
 * 2. We should have write and exec permissions on dir
 * 3. We can't remove anything from append-only dir
 * 4. We can't do anything with immutable dir (done in permission())
 * 5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 * 6. If the victim is append-only or immutable we can't do anything with
 *    links pointing to it.
 * 7. If the victim has an unknown uid or gid we can't change the inode.
 * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	BUG_ON(victim->d_parent->d_inode != dir);

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;

	if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}
3658
/* Check whether we can create an object with dentry child in directory
 * dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *child)
{
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		return -EOVERFLOW;

	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
3681
// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p = p1, *q = p2, *r;

	/* walk up from p1 looking for p2 (or the root of the component) */
	while ((r = p->d_parent) != p2 && r != p)
		p = r;
	if (r == p2) {
		// p is a child of p2 and an ancestor of p1 or p1 itself
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
		return p;
	}
	// p is the root of connected component that contains p1
	// p2 does not occur on the path from p to p1
	while ((r = q->d_parent) != p1 && r != p && r != q)
		q = r;
	if (r == p1) {
		// q is a child of p1 and an ancestor of p2 or p2 itself
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return q;
	} else if (likely(r == p)) {
		// both p2 and p1 are descendants of p
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return NULL;
	} else { // no common ancestor at the time we'd been called
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
		return ERR_PTR(-EXDEV);
	}
}
3714
3715/*
3716 * p1 and p2 should be directories on the same fs.
3717 */
3718struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
3719{
3720 if (p1 == p2) {
3721 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
3722 return NULL;
3723 }
3724
3725 mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3726 return lock_two_directories(p1, p2);
3727}
3728EXPORT_SYMBOL(lock_rename);
3729
/*
 * c1 and p2 should be on the same fs.
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
	if (READ_ONCE(c1->d_parent) == p2) {
		/*
		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
		 */
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		/*
		 * now that p2 is locked, nobody can move in or out of it,
		 * so the test below is safe.
		 */
		if (likely(c1->d_parent == p2))
			return NULL;

		/*
		 * c1 got moved out of p2 while we'd been taking locks;
		 * unlock and fall back to slow case.
		 */
		inode_unlock(p2->d_inode);
	}

	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
	/*
	 * nobody can move out of any directories on this fs.
	 */
	if (likely(c1->d_parent != p2))
		return lock_two_directories(c1->d_parent, p2);

	/*
	 * c1 got moved into p2 while we were taking locks;
	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
	 * for consistency with lock_rename().
	 */
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
	return NULL;
}
EXPORT_SYMBOL(lock_rename_child);
3771
3772void unlock_rename(struct dentry *p1, struct dentry *p2)
3773{
3774 inode_unlock(p1->d_inode);
3775 if (p1 != p2) {
3776 inode_unlock(p2->d_inode);
3777 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
3778 }
3779}
3780EXPORT_SYMBOL(unlock_rename);
3781
/**
 * __start_renaming - lookup and lock names for rename
 * @rd: rename data containing parents and flags, and
 *      for receiving found dentries
 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
 *      LOOKUP_NO_SYMLINKS etc).
 * @old_last: name of object in @rd.old_parent
 * @new_last: name of object in @rd.new_parent
 *
 * Look up two names and ensure locks are in place for
 * rename.
 *
 * On success the found dentries are stored in @rd.old_dentry,
 * @rd.new_dentry and an extra ref is taken on @rd.old_parent.
 * These references and the lock are dropped by end_renaming().
 *
 * The passed in qstrs must have the hash calculated, and no permission
 * checking is performed.
 *
 * Returns: zero or an error.
 */
static int
__start_renaming(struct renamedata *rd, int lookup_flags,
		 struct qstr *old_last, struct qstr *new_last)
{
	struct dentry *trap;
	struct dentry *d1, *d2;
	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
	int err;

	/* exchange may target an existing name; NOREPLACE must not */
	if (rd->flags & RENAME_EXCHANGE)
		target_flags = 0;
	if (rd->flags & RENAME_NOREPLACE)
		target_flags |= LOOKUP_EXCL;

	trap = lock_rename(rd->old_parent, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);

	d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
				  lookup_flags);
	err = PTR_ERR(d1);
	if (IS_ERR(d1))
		goto out_unlock;

	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
				  lookup_flags | target_flags);
	err = PTR_ERR(d2);
	if (IS_ERR(d2))
		goto out_dput_d1;

	if (d1 == trap) {
		/* source is an ancestor of target */
		err = -EINVAL;
		goto out_dput_d2;
	}

	if (d2 == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_dput_d2;
	}

	rd->old_dentry = d1;
	rd->new_dentry = d2;
	dget(rd->old_parent);
	return 0;

out_dput_d2:
	dput(d2);
out_dput_d1:
	dput(d1);
out_unlock:
	unlock_rename(rd->old_parent, rd->new_parent);
	return err;
}
3861
3862/**
3863 * start_renaming - lookup and lock names for rename with permission checking
3864 * @rd: rename data containing parents and flags, and
3865 * for receiving found dentries
3866 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3867 * LOOKUP_NO_SYMLINKS etc).
3868 * @old_last: name of object in @rd.old_parent
3869 * @new_last: name of object in @rd.new_parent
3870 *
3871 * Look up two names and ensure locks are in place for
3872 * rename.
3873 *
3874 * On success the found dentries are stored in @rd.old_dentry,
3875 * @rd.new_dentry. Also the refcount on @rd->old_parent is increased.
3876 * These references and the lock are dropped by end_renaming().
3877 *
3878 * The passed in qstrs need not have the hash calculated, and basic
3879 * eXecute permission checking is performed against @rd.mnt_idmap.
3880 *
3881 * Returns: zero or an error.
3882 */
3883int start_renaming(struct renamedata *rd, int lookup_flags,
3884 struct qstr *old_last, struct qstr *new_last)
3885{
3886 int err;
3887
3888 err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent);
3889 if (err)
3890 return err;
3891 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
3892 if (err)
3893 return err;
3894 return __start_renaming(rd, lookup_flags, old_last, new_last);
3895}
3896EXPORT_SYMBOL(start_renaming);
3897
/*
 * Core of start_renaming_dentry(): @old_dentry is already known, so only
 * the target name is looked up. The qstr must already be hashed and no
 * permission checking is performed here.
 */
static int
__start_renaming_dentry(struct renamedata *rd, int lookup_flags,
			struct dentry *old_dentry, struct qstr *new_last)
{
	struct dentry *trap;
	struct dentry *d2;
	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
	int err;

	/* exchange may target an existing name; NOREPLACE must not */
	if (rd->flags & RENAME_EXCHANGE)
		target_flags = 0;
	if (rd->flags & RENAME_NOREPLACE)
		target_flags |= LOOKUP_EXCL;

	/* Already have the dentry - need to be sure to lock the correct parent */
	trap = lock_rename_child(old_dentry, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);
	if (d_unhashed(old_dentry) ||
	    (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
		/* dentry was removed, or moved and explicit parent requested */
		err = -EINVAL;
		goto out_unlock;
	}

	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
				  lookup_flags | target_flags);
	err = PTR_ERR(d2);
	if (IS_ERR(d2))
		goto out_unlock;

	if (old_dentry == trap) {
		/* source is an ancestor of target */
		err = -EINVAL;
		goto out_dput_d2;
	}

	if (d2 == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_dput_d2;
	}

	rd->old_dentry = dget(old_dentry);
	rd->new_dentry = d2;
	rd->old_parent = dget(old_dentry->d_parent);
	return 0;

out_dput_d2:
	dput(d2);
out_unlock:
	unlock_rename(old_dentry->d_parent, rd->new_parent);
	return err;
}
3955
3956/**
3957 * start_renaming_dentry - lookup and lock name for rename with permission checking
3958 * @rd: rename data containing parents and flags, and
3959 * for receiving found dentries
3960 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3961 * LOOKUP_NO_SYMLINKS etc).
3962 * @old_dentry: dentry of name to move
3963 * @new_last: name of target in @rd.new_parent
3964 *
3965 * Look up target name and ensure locks are in place for
3966 * rename.
3967 *
3968 * On success the found dentry is stored in @rd.new_dentry and
3969 * @rd.old_parent is confirmed to be the parent of @old_dentry. If it
3970 * was originally %NULL, it is set. In either case a reference is taken
3971 * so that end_renaming() can have a stable reference to unlock.
3972 *
3973 * References and the lock can be dropped with end_renaming()
3974 *
3975 * The passed in qstr need not have the hash calculated, and basic
3976 * eXecute permission checking is performed against @rd.mnt_idmap.
3977 *
3978 * Returns: zero or an error.
3979 */
3980int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
3981 struct dentry *old_dentry, struct qstr *new_last)
3982{
3983 int err;
3984
3985 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
3986 if (err)
3987 return err;
3988 return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
3989}
3990EXPORT_SYMBOL(start_renaming_dentry);
3991
/**
 * start_renaming_two_dentries - lock two dentries in given parents for rename
 * @rd: rename data containing parents and flags
 * @old_dentry: dentry of name to move
 * @new_dentry: dentry to move to
 *
 * Ensure locks are in place for rename and check parentage is still correct.
 *
 * On success the two dentries are stored in @rd.old_dentry and
 * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to
 * be the parents of the dentries.
 *
 * References and the lock can be dropped with end_renaming()
 *
 * Returns: zero or an error.
 */
int
start_renaming_two_dentries(struct renamedata *rd,
			    struct dentry *old_dentry, struct dentry *new_dentry)
{
	struct dentry *trap;
	int err;

	/* Already have the dentry - need to be sure to lock the correct parent */
	trap = lock_rename_child(old_dentry, rd->new_parent);
	if (IS_ERR(trap))
		return PTR_ERR(trap);
	err = -EINVAL;
	if (d_unhashed(old_dentry) ||
	    (rd->old_parent && rd->old_parent != old_dentry->d_parent))
		/* old_dentry was removed, or moved and explicit parent requested */
		goto out_unlock;
	if (d_unhashed(new_dentry) ||
	    rd->new_parent != new_dentry->d_parent)
		/* new_dentry was removed or moved */
		goto out_unlock;

	if (old_dentry == trap)
		/* source is an ancestor of target */
		goto out_unlock;

	if (new_dentry == trap) {
		/* target is an ancestor of source */
		if (rd->flags & RENAME_EXCHANGE)
			err = -EINVAL;
		else
			err = -ENOTEMPTY;
		goto out_unlock;
	}

	err = -EEXIST;
	if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE))
		goto out_unlock;

	rd->old_dentry = dget(old_dentry);
	rd->new_dentry = dget(new_dentry);
	rd->old_parent = dget(old_dentry->d_parent);
	return 0;

out_unlock:
	unlock_rename(old_dentry->d_parent, rd->new_parent);
	return err;
}
EXPORT_SYMBOL(start_renaming_two_dentries);
4056
/**
 * end_renaming - release locks and references taken by start_renaming()
 * @rd: rename data previously filled in by start_renaming(),
 *      start_renaming_dentry() or start_renaming_two_dentries()
 *
 * Drops the rename locks first, then the dentry and parent references
 * (the references keep the dentries valid while still locked).
 */
void end_renaming(struct renamedata *rd)
{
	unlock_rename(rd->old_parent, rd->new_parent);
	dput(rd->old_dentry);
	dput(rd->new_dentry);
	dput(rd->old_parent);
}
EXPORT_SYMBOL(end_renaming);
4065
4066/**
4067 * vfs_prepare_mode - prepare the mode to be used for a new inode
4068 * @idmap: idmap of the mount the inode was found from
4069 * @dir: parent directory of the new inode
4070 * @mode: mode of the new inode
4071 * @mask_perms: allowed permission by the vfs
4072 * @type: type of file to be created
4073 *
4074 * This helper consolidates and enforces vfs restrictions on the @mode of a new
4075 * object to be created.
4076 *
4077 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
4078 * the kernel documentation for mode_strip_umask()). Moving umask stripping
4079 * after setgid stripping allows the same ordering for both non-POSIX ACL and
4080 * POSIX ACL supporting filesystems.
4081 *
4082 * Note that it's currently valid for @type to be 0 if a directory is created.
4083 * Filesystems raise that flag individually and we need to check whether each
4084 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
4085 * non-zero type.
4086 *
4087 * Returns: mode to be passed to the filesystem
4088 */
4089static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
4090 const struct inode *dir, umode_t mode,
4091 umode_t mask_perms, umode_t type)
4092{
4093 mode = mode_strip_sgid(idmap, dir, mode);
4094 mode = mode_strip_umask(dir, mode);
4095
4096 /*
4097 * Apply the vfs mandated allowed permission mask and set the type of
4098 * file to be created before we call into the filesystem.
4099 */
4100 mode &= (mask_perms & ~S_IFMT);
4101 mode |= (type & S_IFMT);
4102
4103 return mode;
4104}
4105
/**
 * vfs_create - create new file
 * @idmap:	idmap of the mount the inode was found from
 * @dentry:	dentry of the child file
 * @mode:	mode of the child file
 * @di:		returns parent inode, if the inode is delegated.
 *
 * Create a new file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: zero on success, a negative errno otherwise.
 */
int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
	       struct delegated_inode *di)
{
	struct inode *dir = d_inode(dentry->d_parent);
	int error;

	error = may_create(idmap, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */

	/* restrict to regular-file permission bits and force S_IFREG */
	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	/* break any delegation on the parent before modifying it */
	error = try_break_deleg(dir, di);
	if (error)
		return error;
	error = dir->i_op->create(idmap, dir, dentry, mode, true);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
4147
/**
 * vfs_mkobj - create an object via a caller-supplied constructor
 * @dentry: dentry of the child object
 * @mode: permission bits for the child (S_IFREG is forced)
 * @f: callback that actually instantiates the object
 * @arg: opaque argument passed through to @f
 *
 * Like vfs_create(), but the object is instantiated by @f rather than
 * by the parent directory's ->create() method.  Permission and security
 * checks are done against the raw inode (nop_mnt_idmap).
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&nop_mnt_idmap, dir, dentry);
	if (error)
		return error;

	/* only the permission bits survive; the type is always regular */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);
4168
4169bool may_open_dev(const struct path *path)
4170{
4171 return !(path->mnt->mnt_flags & MNT_NODEV) &&
4172 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
4173}
4174
/*
 * Check whether the object at @path may be opened with access mode
 * @acc_mode (MAY_* bits) and open flags @flag: reject opens invalid
 * for the file type, then apply inode permission, append-only and
 * O_NOATIME restrictions.
 */
static int may_open(struct mnt_idmap *idmap, const struct path *path,
		    int acc_mode, int flag)
{
	struct dentry *dentry = path->dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		return -ELOOP;
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		break;
	case S_IFBLK:
	case S_IFCHR:
		if (!may_open_dev(path))
			return -EACCES;
		fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		/* truncation makes no sense here; silently drop O_TRUNC */
		flag &= ~O_TRUNC;
		break;
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
	default:
		VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode);
	}

	error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
	if (error)
		return error;

	/*
	 * An append-only file must be opened in append mode for writing,
	 * and may never be truncated.
	 */
	if (IS_APPEND(inode)) {
		if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
		return -EPERM;

	return 0;
}
4233
/*
 * Truncate the just-opened file to length 0 (the O_TRUNC part of an
 * open).  Write access on the inode is taken around the security
 * check and do_truncate() call.
 */
static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
	const struct path *path = &filp->f_path;
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;

	error = security_file_truncate(filp);
	if (!error) {
		error = do_truncate(idmap, path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
				    filp);
	}
	put_write_access(inode);
	return error;
}
4251
/*
 * Map an O_ACCMODE value of 3 down to 2 for the lookup code; every
 * other flag combination passes through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
4258
/*
 * Permission checks needed before an O_CREAT open may create @dentry in
 * @dir: the path-mknod security hook, fs{u,g}id mappability on the target
 * superblock, write+exec access on the parent, and the inode-create hook.
 */
static int may_o_create(struct mnt_idmap *idmap,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	/* creating would produce ids unrepresentable in this idmapping */
	if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
		return -EOVERFLOW;

	error = inode_permission(idmap, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}
4277
4278/*
4279 * Attempt to atomically look up, create and open a file from a negative
4280 * dentry.
4281 *
4282 * Returns 0 if successful. The file will have been created and attached to
4283 * @file by the filesystem calling finish_open().
4284 *
4285 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
4286 * be set. The caller will need to perform the open themselves. @path will
4287 * have been updated to point to the new dentry. This may be negative.
4288 *
4289 * Returns an error code otherwise.
4290 */
4291static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
4292 struct file *file,
4293 int open_flag, umode_t mode)
4294{
4295 struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
4296 struct inode *dir = nd->path.dentry->d_inode;
4297 int error;
4298
4299 if (nd->flags & LOOKUP_DIRECTORY)
4300 open_flag |= O_DIRECTORY;
4301
4302 file->__f_path.dentry = DENTRY_NOT_SET;
4303 file->__f_path.mnt = nd->path.mnt;
4304 error = dir->i_op->atomic_open(dir, dentry, file,
4305 open_to_namei_flags(open_flag), mode);
4306 d_lookup_done(dentry);
4307 if (!error) {
4308 if (file->f_mode & FMODE_OPENED) {
4309 if (unlikely(dentry != file->f_path.dentry)) {
4310 dput(dentry);
4311 dentry = dget(file->f_path.dentry);
4312 }
4313 } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
4314 error = -EIO;
4315 } else {
4316 if (file->f_path.dentry) {
4317 dput(dentry);
4318 dentry = file->f_path.dentry;
4319 }
4320 if (unlikely(d_is_negative(dentry)))
4321 error = -ENOENT;
4322 }
4323 }
4324 if (error) {
4325 dput(dentry);
4326 dentry = ERR_PTR(error);
4327 }
4328 return dentry;
4329}
4330
4331/*
4332 * Look up and maybe create and open the last component.
4333 *
4334 * Must be called with parent locked (exclusive in O_CREAT case).
4335 *
4336 * Returns 0 on success, that is, if
4337 * the file was successfully atomically created (if necessary) and opened, or
4338 * the file was not completely opened at this time, though lookups and
4339 * creations were performed.
4340 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
4341 * In the latter case dentry returned in @path might be negative if O_CREAT
4342 * hadn't been specified.
4343 *
4344 * An error code is returned on failure.
4345 */
4346static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
4347 const struct open_flags *op,
4348 bool got_write, struct delegated_inode *delegated_inode)
4349{
4350 struct mnt_idmap *idmap;
4351 struct dentry *dir = nd->path.dentry;
4352 struct inode *dir_inode = dir->d_inode;
4353 int open_flag = op->open_flag;
4354 struct dentry *dentry;
4355 int error, create_error = 0;
4356 umode_t mode = op->mode;
4357 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
4358
4359 if (unlikely(IS_DEADDIR(dir_inode)))
4360 return ERR_PTR(-ENOENT);
4361
4362 file->f_mode &= ~FMODE_CREATED;
4363 dentry = d_lookup(dir, &nd->last);
4364 for (;;) {
4365 if (!dentry) {
4366 dentry = d_alloc_parallel(dir, &nd->last, &wq);
4367 if (IS_ERR(dentry))
4368 return dentry;
4369 }
4370 if (d_in_lookup(dentry))
4371 break;
4372
4373 error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
4374 if (likely(error > 0))
4375 break;
4376 if (error)
4377 goto out_dput;
4378 d_invalidate(dentry);
4379 dput(dentry);
4380 dentry = NULL;
4381 }
4382 if (dentry->d_inode) {
4383 /* Cached positive dentry: will open in f_op->open */
4384 return dentry;
4385 }
4386
4387 if (open_flag & O_CREAT)
4388 audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
4389
4390 /*
4391 * Checking write permission is tricky, bacuse we don't know if we are
4392 * going to actually need it: O_CREAT opens should work as long as the
4393 * file exists. But checking existence breaks atomicity. The trick is
4394 * to check access and if not granted clear O_CREAT from the flags.
4395 *
4396 * Another problem is returing the "right" error value (e.g. for an
4397 * O_EXCL open we want to return EEXIST not EROFS).
4398 */
4399 if (unlikely(!got_write))
4400 open_flag &= ~O_TRUNC;
4401 idmap = mnt_idmap(nd->path.mnt);
4402 if (open_flag & O_CREAT) {
4403 if (open_flag & O_EXCL)
4404 open_flag &= ~O_TRUNC;
4405 mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
4406 if (likely(got_write))
4407 create_error = may_o_create(idmap, &nd->path,
4408 dentry, mode);
4409 else
4410 create_error = -EROFS;
4411 }
4412 if (create_error)
4413 open_flag &= ~O_CREAT;
4414 if (dir_inode->i_op->atomic_open) {
4415 dentry = atomic_open(nd, dentry, file, open_flag, mode);
4416 if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
4417 dentry = ERR_PTR(create_error);
4418 return dentry;
4419 }
4420
4421 if (d_in_lookup(dentry)) {
4422 struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
4423 nd->flags);
4424 d_lookup_done(dentry);
4425 if (unlikely(res)) {
4426 if (IS_ERR(res)) {
4427 error = PTR_ERR(res);
4428 goto out_dput;
4429 }
4430 dput(dentry);
4431 dentry = res;
4432 }
4433 }
4434
4435 /* Negative dentry, just create the file */
4436 if (!dentry->d_inode && (open_flag & O_CREAT)) {
4437 /* but break the directory lease first! */
4438 error = try_break_deleg(dir_inode, delegated_inode);
4439 if (error)
4440 goto out_dput;
4441
4442 file->f_mode |= FMODE_CREATED;
4443 audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
4444 if (!dir_inode->i_op->create) {
4445 error = -EACCES;
4446 goto out_dput;
4447 }
4448
4449 error = dir_inode->i_op->create(idmap, dir_inode, dentry,
4450 mode, open_flag & O_EXCL);
4451 if (error)
4452 goto out_dput;
4453 }
4454 if (unlikely(create_error) && !dentry->d_inode) {
4455 error = create_error;
4456 goto out_dput;
4457 }
4458 return dentry;
4459
4460out_dput:
4461 dput(dentry);
4462 return ERR_PTR(error);
4463}
4464
4465static inline bool trailing_slashes(struct nameidata *nd)
4466{
4467 return (bool)nd->last.name[nd->last.len];
4468}
4469
/*
 * Cached (possibly RCU-mode) lookup of the last component for open().
 * Returns the dentry on a usable cache hit, NULL if the slow path must
 * be taken, or an ERR_PTR.  May add LOOKUP_FOLLOW|LOOKUP_DIRECTORY for
 * a trailing '/'.
 */
static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
{
	struct dentry *dentry;

	if (open_flag & O_CREAT) {
		/* a trailing '/' can never name a regular file to create */
		if (trailing_slashes(nd))
			return ERR_PTR(-EISDIR);

		/* Don't bother on an O_EXCL create */
		if (open_flag & O_EXCL)
			return NULL;
	}

	if (trailing_slashes(nd))
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	dentry = lookup_fast(nd);
	if (IS_ERR_OR_NULL(dentry))
		return dentry;

	if (open_flag & O_CREAT) {
		/* Discard negative dentries. Need inode_lock to do the create */
		if (!dentry->d_inode) {
			/* in RCU mode no reference was taken, nothing to drop */
			if (!(nd->flags & LOOKUP_RCU))
				dput(dentry);
			dentry = NULL;
		}
	}
	return dentry;
}
4500
/*
 * Handle the last component of an open(): try the cached fast path
 * (possibly in RCU mode), otherwise fall back to lookup_open() under the
 * parent's inode lock, retrying if a directory delegation had to be
 * broken.  Returns NULL when the walk is finished, a symlink body to
 * continue with, or an ERR_PTR.
 */
static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct delegated_inode delegated_inode = { };
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;

	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	/* We _can_ be in RCU mode here */
	dentry = lookup_fast_for_open(nd, open_flag);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	if (likely(dentry))
		goto finish_lookup;

	if (!(open_flag & O_CREAT)) {
		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
			return ERR_PTR(-ECHILD);
	} else {
		/* O_CREAT needs a real reference walk before taking locks */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
	}
retry:
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	/* exclusive lock for create, shared is enough for plain lookup */
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	dentry = lookup_open(nd, file, op, got_write, &delegated_inode);
	if (!IS_ERR(dentry)) {
		if (file->f_mode & FMODE_CREATED)
			fsnotify_create(dir->d_inode, dentry);
		if (file->f_mode & FMODE_OPENED)
			fsnotify_open(file);
	}
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry)) {
		/* wait out a broken directory delegation and try again */
		if (is_delegated(&delegated_inode)) {
			int error = break_deleg_wait(&delegated_inode);

			if (!error)
				goto retry;
			return ERR_PTR(error);
		}
		return ERR_CAST(dentry);
	}

	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}
4589
4590/*
4591 * Handle the last step of open()
4592 */
4593static int do_open(struct nameidata *nd,
4594 struct file *file, const struct open_flags *op)
4595{
4596 struct mnt_idmap *idmap;
4597 int open_flag = op->open_flag;
4598 bool do_truncate;
4599 int acc_mode;
4600 int error;
4601
4602 if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
4603 error = complete_walk(nd);
4604 if (error)
4605 return error;
4606 }
4607 if (!(file->f_mode & FMODE_CREATED))
4608 audit_inode(nd->name, nd->path.dentry, 0);
4609 idmap = mnt_idmap(nd->path.mnt);
4610 if (open_flag & O_CREAT) {
4611 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
4612 return -EEXIST;
4613 if (d_is_dir(nd->path.dentry))
4614 return -EISDIR;
4615 error = may_create_in_sticky(idmap, nd,
4616 d_backing_inode(nd->path.dentry));
4617 if (unlikely(error))
4618 return error;
4619 }
4620 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
4621 return -ENOTDIR;
4622
4623 do_truncate = false;
4624 acc_mode = op->acc_mode;
4625 if (file->f_mode & FMODE_CREATED) {
4626 /* Don't check for write permission, don't truncate */
4627 open_flag &= ~O_TRUNC;
4628 acc_mode = 0;
4629 } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
4630 error = mnt_want_write(nd->path.mnt);
4631 if (error)
4632 return error;
4633 do_truncate = true;
4634 }
4635 error = may_open(idmap, &nd->path, acc_mode, open_flag);
4636 if (!error && !(file->f_mode & FMODE_OPENED))
4637 error = vfs_open(&nd->path, file);
4638 if (!error)
4639 error = security_file_post_open(file, op->acc_mode);
4640 if (!error && do_truncate)
4641 error = handle_truncate(idmap, file);
4642 if (unlikely(error > 0)) {
4643 WARN_ON(1);
4644 error = -EINVAL;
4645 }
4646 if (do_truncate)
4647 mnt_drop_write(nd->path.mnt);
4648 return error;
4649}
4650
4651/**
4652 * vfs_tmpfile - create tmpfile
4653 * @idmap: idmap of the mount the inode was found from
4654 * @parentpath: pointer to the path of the base directory
4655 * @file: file descriptor of the new tmpfile
4656 * @mode: mode of the new tmpfile
4657 *
4658 * Create a temporary file.
4659 *
4660 * If the inode has been found through an idmapped mount the idmap of
4661 * the vfsmount must be passed through @idmap. This function will then take
4662 * care to map the inode according to @idmap before checking permissions.
4663 * On non-idmapped mounts or if permission checking is to be performed on the
4664 * raw inode simply pass @nop_mnt_idmap.
4665 */
4666int vfs_tmpfile(struct mnt_idmap *idmap,
4667 const struct path *parentpath,
4668 struct file *file, umode_t mode)
4669{
4670 struct dentry *child;
4671 struct inode *dir = d_inode(parentpath->dentry);
4672 struct inode *inode;
4673 int error;
4674 int open_flag = file->f_flags;
4675
4676 /* we want directory to be writable */
4677 error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
4678 if (error)
4679 return error;
4680 if (!dir->i_op->tmpfile)
4681 return -EOPNOTSUPP;
4682 child = d_alloc(parentpath->dentry, &slash_name);
4683 if (unlikely(!child))
4684 return -ENOMEM;
4685 file->__f_path.mnt = parentpath->mnt;
4686 file->__f_path.dentry = child;
4687 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
4688 error = dir->i_op->tmpfile(idmap, dir, file, mode);
4689 dput(child);
4690 if (file->f_mode & FMODE_OPENED)
4691 fsnotify_open(file);
4692 if (error)
4693 return error;
4694 /* Don't check for other permissions, the inode was just created */
4695 error = may_open(idmap, &file->f_path, 0, file->f_flags);
4696 if (error)
4697 return error;
4698 inode = file_inode(file);
4699 if (!(open_flag & O_EXCL)) {
4700 spin_lock(&inode->i_lock);
4701 inode_state_set(inode, I_LINKABLE);
4702 spin_unlock(&inode->i_lock);
4703 }
4704 security_inode_post_create_tmpfile(idmap, inode);
4705 return 0;
4706}
4707
4708/**
4709 * kernel_tmpfile_open - open a tmpfile for kernel internal use
4710 * @idmap: idmap of the mount the inode was found from
4711 * @parentpath: path of the base directory
4712 * @mode: mode of the new tmpfile
4713 * @open_flag: flags
4714 * @cred: credentials for open
4715 *
4716 * Create and open a temporary file. The file is not accounted in nr_files,
4717 * hence this is only for kernel internal use, and must not be installed into
4718 * file tables or such.
4719 */
4720struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
4721 const struct path *parentpath,
4722 umode_t mode, int open_flag,
4723 const struct cred *cred)
4724{
4725 struct file *file;
4726 int error;
4727
4728 file = alloc_empty_file_noaccount(open_flag, cred);
4729 if (IS_ERR(file))
4730 return file;
4731
4732 error = vfs_tmpfile(idmap, parentpath, file, mode);
4733 if (error) {
4734 fput(file);
4735 file = ERR_PTR(error);
4736 }
4737 return file;
4738}
4739EXPORT_SYMBOL(kernel_tmpfile_open);
4740
/*
 * Implement the O_TMPFILE open path: resolve the directory named by the
 * path, take write access on its mount and create an unnamed file in it
 * via vfs_tmpfile().
 */
static int do_tmpfile(struct nameidata *nd, unsigned flags,
		const struct open_flags *op,
		struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

	if (unlikely(error))
		return error;
	error = mnt_want_write(path.mnt);
	if (unlikely(error))
		goto out;
	error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
	if (error)
		goto out2;
	audit_inode(nd->name, file->f_path.dentry, 0);
out2:
	mnt_drop_write(path.mnt);
out:
	path_put(&path);
	return error;
}
4763
4764static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
4765{
4766 struct path path;
4767 int error = path_lookupat(nd, flags, &path);
4768 if (!error) {
4769 audit_inode(nd->name, path.dentry, 0);
4770 error = vfs_open(&path, file);
4771 path_put(&path);
4772 }
4773 return error;
4774}
4775
/*
 * Core of open(): allocate an empty struct file and drive the path walk
 * (or the O_TMPFILE / O_PATH special cases).  On failure the file is
 * released and -EOPENSTALE is mapped to -ECHILD (to trigger a non-RCU
 * retry) or -ESTALE (to trigger a LOOKUP_REVAL retry).
 */
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);
		/* walk components, restarting on each symlink body returned */
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);
		terminate_walk(nd);
	}
	if (likely(!error)) {
		if (likely(file->f_mode & FMODE_OPENED))
			return file;
		WARN_ON(1);
		error = -EINVAL;
	}
	fput_close(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
	}
	return ERR_PTR(error);
}
4814
/*
 * Top-level open relative to @dfd: try the lazy RCU walk first, fall
 * back to ref-walk on -ECHILD, and retry with LOOKUP_REVAL if cached
 * state turned out to be stale (-ESTALE).
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	int flags = op->lookup_flags;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(&nd, op, flags);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	return filp;
}
4831
/*
 * Like do_filp_open(), but resolve @name relative to the given @root
 * path instead of a file descriptor.  A symlink as root is rejected for
 * LOOKUP_OPEN intents.  Uses the same RCU / ref-walk / REVAL retry
 * sequence as do_filp_open().
 */
struct file *do_file_open_root(const struct path *root,
		const char *name, const struct open_flags *op)
{
	struct nameidata nd;
	struct file *file;
	struct filename *filename;
	int flags = op->lookup_flags;

	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
		return ERR_PTR(-ELOOP);

	filename = getname_kernel(name);
	if (IS_ERR(filename))
		return ERR_CAST(filename);

	set_nameidata(&nd, -1, filename, root);
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(&nd, op, flags);
	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	putname(filename);
	return file;
}
4857
/*
 * Resolve @name down to its parent directory and return a (typically
 * negative) locked dentry for the last component, ready for a create
 * operation.  On success, write access on the mount is held and @path
 * references the parent.  On failure an ERR_PTR is returned and the
 * path reference is dropped.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
				      struct path *path, unsigned int lookup_flags)
{
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct qstr last;
	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
	unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
	int type;
	int error;

	error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
	if (error)
		return ERR_PTR(error);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (unlikely(type != LAST_NORM))
		goto out;

	/* don't fail immediately if it's r/o, at least try to report other errors */
	error = mnt_want_write(path->mnt);
	/*
	 * Do the final lookup. Suppress 'create' if there is a trailing
	 * '/', and a directory wasn't requested.
	 */
	if (last.name[last.len] && !want_dir)
		create_flags &= ~LOOKUP_CREATE;
	dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
	if (IS_ERR(dentry))
		goto out_drop_write;

	/* now report a deferred mnt_want_write() failure, if any */
	if (unlikely(error))
		goto fail;

	return dentry;
fail:
	end_dirop(dentry);
	dentry = ERR_PTR(error);
out_drop_write:
	if (!error)
		mnt_drop_write(path->mnt);
out:
	path_put(path);
	return dentry;
}
4906
/*
 * Kernel-string variant: wrap @pathname in a struct filename and hand
 * it to filename_create(), releasing the name afterwards.
 */
struct dentry *start_creating_path(int dfd, const char *pathname,
				   struct path *path, unsigned int lookup_flags)
{
	struct filename *tmp;
	struct dentry *dentry;

	tmp = getname_kernel(pathname);
	dentry = filename_create(dfd, tmp, path, lookup_flags);
	putname(tmp);
	return dentry;
}
EXPORT_SYMBOL(start_creating_path);
4917
4918/**
4919 * end_creating_path - finish a code section started by start_creating_path()
4920 * @path: the path instantiated by start_creating_path()
4921 * @dentry: the dentry returned by start_creating_path()
4922 *
4923 * end_creating_path() will unlock and locks taken by start_creating_path()
4924 * and drop an references that were taken. It should only be called
4925 * if start_creating_path() returned a non-error.
4926 * If vfs_mkdir() was called and it returned an error, that error *should*
4927 * be passed to end_creating_path() together with the path.
4928 */
4929void end_creating_path(const struct path *path, struct dentry *dentry)
4930{
4931 end_creating(dentry);
4932 mnt_drop_write(path->mnt);
4933 path_put(path);
4934}
4935EXPORT_SYMBOL(end_creating_path);
4936
4937inline struct dentry *start_creating_user_path(
4938 int dfd, const char __user *pathname,
4939 struct path *path, unsigned int lookup_flags)
4940{
4941 struct filename *filename = getname(pathname);
4942 struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
4943
4944 putname(filename);
4945 return res;
4946}
4947EXPORT_SYMBOL(start_creating_user_path);
4948
4949
4950/**
4951 * vfs_mknod - create device node or file
4952 * @idmap: idmap of the mount the inode was found from
4953 * @dir: inode of the parent directory
4954 * @dentry: dentry of the child device node
4955 * @mode: mode of the child device node
4956 * @dev: device number of device to create
4957 * @delegated_inode: returns parent inode, if the inode is delegated.
4958 *
4959 * Create a device node or file.
4960 *
4961 * If the inode has been found through an idmapped mount the idmap of
4962 * the vfsmount must be passed through @idmap. This function will then take
4963 * care to map the inode according to @idmap before checking permissions.
4964 * On non-idmapped mounts or if permission checking is to be performed on the
4965 * raw inode simply pass @nop_mnt_idmap.
4966 */
4967int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
4968 struct dentry *dentry, umode_t mode, dev_t dev,
4969 struct delegated_inode *delegated_inode)
4970{
4971 bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
4972 int error = may_create(idmap, dir, dentry);
4973
4974 if (error)
4975 return error;
4976
4977 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
4978 !capable(CAP_MKNOD))
4979 return -EPERM;
4980
4981 if (!dir->i_op->mknod)
4982 return -EPERM;
4983
4984 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
4985 error = devcgroup_inode_mknod(mode, dev);
4986 if (error)
4987 return error;
4988
4989 error = security_inode_mknod(dir, dentry, mode, dev);
4990 if (error)
4991 return error;
4992
4993 error = try_break_deleg(dir, delegated_inode);
4994 if (error)
4995 return error;
4996
4997 error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4998 if (!error)
4999 fsnotify_create(dir, dentry);
5000 return error;
5001}
5002EXPORT_SYMBOL(vfs_mknod);
5003
5004static int may_mknod(umode_t mode)
5005{
5006 switch (mode & S_IFMT) {
5007 case S_IFREG:
5008 case S_IFCHR:
5009 case S_IFBLK:
5010 case S_IFIFO:
5011 case S_IFSOCK:
5012 case 0: /* zero mode translates to S_IFREG */
5013 return 0;
5014 case S_IFDIR:
5015 return -EPERM;
5016 default:
5017 return -EINVAL;
5018 }
5019}
5020
/*
 * Common implementation of mknod(2)/mknodat(2): validate the type,
 * look up and lock the target, then dispatch to vfs_create() or
 * vfs_mknod().  Retries on broken delegations and on ESTALE (with
 * LOOKUP_REVAL).  Consumes @name.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
		unsigned int dev)
{
	struct delegated_inode di = { };
	struct mnt_idmap *idmap;
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		goto out1;
retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out1;

	error = security_path_mknod(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode), dev);
	if (error)
		goto out2;

	idmap = mnt_idmap(path.mnt);
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(idmap, dentry, mode, &di);
			if (!error)
				security_path_post_mknod(idmap, dentry);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, new_decode_dev(dev), &di);
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, 0, &di);
			break;
	}
out2:
	end_creating_path(&path, dentry);
	if (is_delegated(&di)) {
		error = break_deleg_wait(&di);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out1:
	putname(name);
	return error;
}
5076
/* mknodat(2): create a filesystem node relative to @dfd */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, getname(filename), mode, dev);
}
5082
/* mknod(2): create a filesystem node relative to the cwd */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
5087
5088/**
5089 * vfs_mkdir - create directory returning correct dentry if possible
5090 * @idmap: idmap of the mount the inode was found from
5091 * @dir: inode of the parent directory
5092 * @dentry: dentry of the child directory
5093 * @mode: mode of the child directory
5094 * @delegated_inode: returns parent inode, if the inode is delegated.
5095 *
5096 * Create a directory.
5097 *
5098 * If the inode has been found through an idmapped mount the idmap of
5099 * the vfsmount must be passed through @idmap. This function will then take
5100 * care to map the inode according to @idmap before checking permissions.
5101 * On non-idmapped mounts or if permission checking is to be performed on the
5102 * raw inode simply pass @nop_mnt_idmap.
5103 *
5104 * In the event that the filesystem does not use the *@dentry but leaves it
5105 * negative or unhashes it and possibly splices a different one returning it,
5106 * the original dentry is dput() and the alternate is returned.
5107 *
5108 * In case of an error the dentry is dput() and an ERR_PTR() is returned.
5109 */
5110struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
5111 struct dentry *dentry, umode_t mode,
5112 struct delegated_inode *delegated_inode)
5113{
5114 int error;
5115 unsigned max_links = dir->i_sb->s_max_links;
5116 struct dentry *de;
5117
5118 error = may_create(idmap, dir, dentry);
5119 if (error)
5120 goto err;
5121
5122 error = -EPERM;
5123 if (!dir->i_op->mkdir)
5124 goto err;
5125
5126 mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
5127 error = security_inode_mkdir(dir, dentry, mode);
5128 if (error)
5129 goto err;
5130
5131 error = -EMLINK;
5132 if (max_links && dir->i_nlink >= max_links)
5133 goto err;
5134
5135 error = try_break_deleg(dir, delegated_inode);
5136 if (error)
5137 goto err;
5138
5139 de = dir->i_op->mkdir(idmap, dir, dentry, mode);
5140 error = PTR_ERR(de);
5141 if (IS_ERR(de))
5142 goto err;
5143 if (de) {
5144 dput(dentry);
5145 dentry = de;
5146 }
5147 fsnotify_mkdir(dir, dentry);
5148 return dentry;
5149
5150err:
5151 end_creating(dentry);
5152 return ERR_PTR(error);
5153}
5154EXPORT_SYMBOL(vfs_mkdir);
5155
/*
 * Common implementation of mkdir(2)/mkdirat(2): look up and lock the
 * target, run the security hook, and call vfs_mkdir().  Retries on
 * broken delegations and on ESTALE (with LOOKUP_REVAL).  Consumes
 * @name.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;
	struct delegated_inode delegated_inode = { };

retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = security_path_mkdir(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode));
	if (!error) {
		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
				   dentry, mode, &delegated_inode);
		if (IS_ERR(dentry))
			error = PTR_ERR(dentry);
	}
	end_creating_path(&path, dentry);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(name);
	return error;
}
5192
/* mkdirat(2): create a directory relative to @dfd */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(dfd, getname(pathname), mode);
}
5197
/* mkdir(2): mkdirat() relative to the current working directory */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
5202
5203/**
5204 * vfs_rmdir - remove directory
5205 * @idmap: idmap of the mount the inode was found from
5206 * @dir: inode of the parent directory
5207 * @dentry: dentry of the child directory
5208 * @delegated_inode: returns parent inode, if it's delegated.
5209 *
5210 * Remove a directory.
5211 *
5212 * If the inode has been found through an idmapped mount the idmap of
5213 * the vfsmount must be passed through @idmap. This function will then take
5214 * care to map the inode according to @idmap before checking permissions.
5215 * On non-idmapped mounts or if permission checking is to be performed on the
5216 * raw inode simply pass @nop_mnt_idmap.
5217 */
5218int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
5219 struct dentry *dentry, struct delegated_inode *delegated_inode)
5220{
5221 int error = may_delete(idmap, dir, dentry, 1);
5222
5223 if (error)
5224 return error;
5225
5226 if (!dir->i_op->rmdir)
5227 return -EPERM;
5228
5229 dget(dentry);
5230 inode_lock(dentry->d_inode);
5231
5232 error = -EBUSY;
5233 if (is_local_mountpoint(dentry) ||
5234 (dentry->d_inode->i_flags & S_KERNEL_FILE))
5235 goto out;
5236
5237 error = security_inode_rmdir(dir, dentry);
5238 if (error)
5239 goto out;
5240
5241 error = try_break_deleg(dir, delegated_inode);
5242 if (error)
5243 goto out;
5244
5245 error = dir->i_op->rmdir(dir, dentry);
5246 if (error)
5247 goto out;
5248
5249 shrink_dcache_parent(dentry);
5250 dentry->d_inode->i_flags |= S_DEAD;
5251 dont_mount(dentry);
5252 detach_mounts(dentry);
5253
5254out:
5255 inode_unlock(dentry->d_inode);
5256 dput(dentry);
5257 if (!error)
5258 d_delete_notify(dir, dentry);
5259 return error;
5260}
5261EXPORT_SYMBOL(vfs_rmdir);
5262
/*
 * Remove the directory named by @name relative to @dfd.  Consumes @name.
 * Retries after a delegation break and once more with LOOKUP_REVAL on
 * -ESTALE.
 */
int do_rmdir(int dfd, struct filename *name)
{
	int error;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
	struct delegated_inode delegated_inode = { };
retry:
	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
	if (error)
		goto exit1;

	/* only an ordinary last component may be removed */
	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit2;
	case LAST_DOT:
		error = -EINVAL;
		goto exit2;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit2;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit2;

	/* locks the parent and looks up the victim */
	dentry = start_dirop(path.dentry, &last, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit3;
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit4;
	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode,
			  dentry, &delegated_inode);
exit4:
	end_dirop(dentry);
exit3:
	mnt_drop_write(path.mnt);
exit2:
	path_put(&path);
	if (is_delegated(&delegated_inode)) {
		/* all locks dropped - safe to wait for the delegation break */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit1:
	putname(name);
	return error;
}
5321
/* rmdir(2): remove a directory relative to the current working directory */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, getname(pathname));
}
5326
5327/**
5328 * vfs_unlink - unlink a filesystem object
5329 * @idmap: idmap of the mount the inode was found from
5330 * @dir: parent directory
5331 * @dentry: victim
5332 * @delegated_inode: returns victim inode, if the inode is delegated.
5333 *
5334 * The caller must hold dir->i_rwsem exclusively.
5335 *
5336 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
5337 * return a reference to the inode in delegated_inode. The caller
5338 * should then break the delegation on that inode and retry. Because
5339 * breaking a delegation may take a long time, the caller should drop
5340 * dir->i_rwsem before doing so.
5341 *
5342 * Alternatively, a caller may pass NULL for delegated_inode. This may
5343 * be appropriate for callers that expect the underlying filesystem not
5344 * to be NFS exported.
5345 *
5346 * If the inode has been found through an idmapped mount the idmap of
5347 * the vfsmount must be passed through @idmap. This function will then take
5348 * care to map the inode according to @idmap before checking permissions.
5349 * On non-idmapped mounts or if permission checking is to be performed on the
5350 * raw inode simply pass @nop_mnt_idmap.
5351 */
5352int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
5353 struct dentry *dentry, struct delegated_inode *delegated_inode)
5354{
5355 struct inode *target = dentry->d_inode;
5356 int error = may_delete(idmap, dir, dentry, 0);
5357
5358 if (error)
5359 return error;
5360
5361 if (!dir->i_op->unlink)
5362 return -EPERM;
5363
5364 inode_lock(target);
5365 if (IS_SWAPFILE(target))
5366 error = -EPERM;
5367 else if (is_local_mountpoint(dentry))
5368 error = -EBUSY;
5369 else {
5370 error = security_inode_unlink(dir, dentry);
5371 if (!error) {
5372 error = try_break_deleg(dir, delegated_inode);
5373 if (error)
5374 goto out;
5375 error = try_break_deleg(target, delegated_inode);
5376 if (error)
5377 goto out;
5378 error = dir->i_op->unlink(dir, dentry);
5379 if (!error) {
5380 dont_mount(dentry);
5381 detach_mounts(dentry);
5382 }
5383 }
5384 }
5385out:
5386 inode_unlock(target);
5387
5388 /* We don't d_delete() NFS sillyrenamed files--they still exist. */
5389 if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
5390 fsnotify_unlink(dir, dentry);
5391 } else if (!error) {
5392 fsnotify_link_count(target);
5393 d_delete_notify(dir, dentry);
5394 }
5395
5396 return error;
5397}
5398EXPORT_SYMBOL(vfs_unlink);
5399
5400/*
5401 * Make sure that the actual truncation of the file will occur outside its
5402 * directory's i_rwsem. Truncate can take a long time if there is a lot of
5403 * writeout happening, and we don't want to prevent access to the directory
5404 * while waiting on the I/O.
5405 */
5406int do_unlinkat(int dfd, struct filename *name)
5407{
5408 int error;
5409 struct dentry *dentry;
5410 struct path path;
5411 struct qstr last;
5412 int type;
5413 struct inode *inode;
5414 struct delegated_inode delegated_inode = { };
5415 unsigned int lookup_flags = 0;
5416retry:
5417 error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
5418 if (error)
5419 goto exit_putname;
5420
5421 error = -EISDIR;
5422 if (type != LAST_NORM)
5423 goto exit_path_put;
5424
5425 error = mnt_want_write(path.mnt);
5426 if (error)
5427 goto exit_path_put;
5428retry_deleg:
5429 dentry = start_dirop(path.dentry, &last, lookup_flags);
5430 error = PTR_ERR(dentry);
5431 if (IS_ERR(dentry))
5432 goto exit_drop_write;
5433
5434 /* Why not before? Because we want correct error value */
5435 if (unlikely(last.name[last.len])) {
5436 if (d_is_dir(dentry))
5437 error = -EISDIR;
5438 else
5439 error = -ENOTDIR;
5440 end_dirop(dentry);
5441 goto exit_drop_write;
5442 }
5443 inode = dentry->d_inode;
5444 ihold(inode);
5445 error = security_path_unlink(&path, dentry);
5446 if (error)
5447 goto exit_end_dirop;
5448 error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
5449 dentry, &delegated_inode);
5450exit_end_dirop:
5451 end_dirop(dentry);
5452 iput(inode); /* truncate the inode here */
5453 if (is_delegated(&delegated_inode)) {
5454 error = break_deleg_wait(&delegated_inode);
5455 if (!error)
5456 goto retry_deleg;
5457 }
5458exit_drop_write:
5459 mnt_drop_write(path.mnt);
5460exit_path_put:
5461 path_put(&path);
5462 if (retry_estale(error, lookup_flags)) {
5463 lookup_flags |= LOOKUP_REVAL;
5464 goto retry;
5465 }
5466exit_putname:
5467 putname(name);
5468 return error;
5469}
5470
5471SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
5472{
5473 if ((flag & ~AT_REMOVEDIR) != 0)
5474 return -EINVAL;
5475
5476 if (flag & AT_REMOVEDIR)
5477 return do_rmdir(dfd, getname(pathname));
5478 return do_unlinkat(dfd, getname(pathname));
5479}
5480
/* unlink(2): remove a file relative to the current working directory */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, getname(pathname));
}
5485
5486/**
5487 * vfs_symlink - create symlink
5488 * @idmap: idmap of the mount the inode was found from
5489 * @dir: inode of the parent directory
5490 * @dentry: dentry of the child symlink file
5491 * @oldname: name of the file to link to
5492 * @delegated_inode: returns victim inode, if the inode is delegated.
5493 *
5494 * Create a symlink.
5495 *
5496 * If the inode has been found through an idmapped mount the idmap of
5497 * the vfsmount must be passed through @idmap. This function will then take
5498 * care to map the inode according to @idmap before checking permissions.
5499 * On non-idmapped mounts or if permission checking is to be performed on the
5500 * raw inode simply pass @nop_mnt_idmap.
5501 */
5502int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
5503 struct dentry *dentry, const char *oldname,
5504 struct delegated_inode *delegated_inode)
5505{
5506 int error;
5507
5508 error = may_create(idmap, dir, dentry);
5509 if (error)
5510 return error;
5511
5512 if (!dir->i_op->symlink)
5513 return -EPERM;
5514
5515 error = security_inode_symlink(dir, dentry, oldname);
5516 if (error)
5517 return error;
5518
5519 error = try_break_deleg(dir, delegated_inode);
5520 if (error)
5521 return error;
5522
5523 error = dir->i_op->symlink(idmap, dir, dentry, oldname);
5524 if (!error)
5525 fsnotify_create(dir, dentry);
5526 return error;
5527}
5528EXPORT_SYMBOL(vfs_symlink);
5529
/*
 * Create a symlink at @to (relative to @newdfd) whose body is @from->name.
 * Consumes both filenames; @from may already be an ERR_PTR from getname(),
 * which is propagated.  Retries after a delegation break and once more
 * with LOOKUP_REVAL on -ESTALE.
 */
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
	int error;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;
	struct delegated_inode delegated_inode = { };

	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto out_putnames;
	}
retry:
	dentry = filename_create(newdfd, to, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putnames;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				    dentry, from->name, &delegated_inode);
	end_creating_path(&path, dentry);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putnames:
	putname(to);
	putname(from);
	return error;
}
5567
/* symlinkat(2): create a symlink to @oldname at @newname relative to @newdfd */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
5573
/* symlink(2): symlinkat() relative to the current working directory */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
5578
5579/**
5580 * vfs_link - create a new link
5581 * @old_dentry: object to be linked
5582 * @idmap: idmap of the mount
5583 * @dir: new parent
5584 * @new_dentry: where to create the new link
5585 * @delegated_inode: returns inode needing a delegation break
5586 *
5587 * The caller must hold dir->i_rwsem exclusively.
5588 *
5589 * If vfs_link discovers a delegation on the to-be-linked file in need
5590 * of breaking, it will return -EWOULDBLOCK and return a reference to the
5591 * inode in delegated_inode. The caller should then break the delegation
5592 * and retry. Because breaking a delegation may take a long time, the
5593 * caller should drop the i_rwsem before doing so.
5594 *
5595 * Alternatively, a caller may pass NULL for delegated_inode. This may
5596 * be appropriate for callers that expect the underlying filesystem not
5597 * to be NFS exported.
5598 *
5599 * If the inode has been found through an idmapped mount the idmap of
5600 * the vfsmount must be passed through @idmap. This function will then take
5601 * care to map the inode according to @idmap before checking permissions.
5602 * On non-idmapped mounts or if permission checking is to be performed on the
5603 * raw inode simply pass @nop_mnt_idmap.
5604 */
5605int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
5606 struct inode *dir, struct dentry *new_dentry,
5607 struct delegated_inode *delegated_inode)
5608{
5609 struct inode *inode = old_dentry->d_inode;
5610 unsigned max_links = dir->i_sb->s_max_links;
5611 int error;
5612
5613 if (!inode)
5614 return -ENOENT;
5615
5616 error = may_create(idmap, dir, new_dentry);
5617 if (error)
5618 return error;
5619
5620 if (dir->i_sb != inode->i_sb)
5621 return -EXDEV;
5622
5623 /*
5624 * A link to an append-only or immutable file cannot be created.
5625 */
5626 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
5627 return -EPERM;
5628 /*
5629 * Updating the link count will likely cause i_uid and i_gid to
5630 * be written back improperly if their true value is unknown to
5631 * the vfs.
5632 */
5633 if (HAS_UNMAPPED_ID(idmap, inode))
5634 return -EPERM;
5635 if (!dir->i_op->link)
5636 return -EPERM;
5637 if (S_ISDIR(inode->i_mode))
5638 return -EPERM;
5639
5640 error = security_inode_link(old_dentry, dir, new_dentry);
5641 if (error)
5642 return error;
5643
5644 inode_lock(inode);
5645 /* Make sure we don't allow creating hardlink to an unlinked file */
5646 if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
5647 error = -ENOENT;
5648 else if (max_links && inode->i_nlink >= max_links)
5649 error = -EMLINK;
5650 else {
5651 error = try_break_deleg(dir, delegated_inode);
5652 if (!error)
5653 error = try_break_deleg(inode, delegated_inode);
5654 if (!error)
5655 error = dir->i_op->link(old_dentry, dir, new_dentry);
5656 }
5657
5658 if (!error && (inode_state_read_once(inode) & I_LINKABLE)) {
5659 spin_lock(&inode->i_lock);
5660 inode_state_clear(inode, I_LINKABLE);
5661 spin_unlock(&inode->i_lock);
5662 }
5663 inode_unlock(inode);
5664 if (!error)
5665 fsnotify_link(dir, inode, new_dentry);
5666 return error;
5667}
5668EXPORT_SYMBOL(vfs_link);
5669
5670/*
5671 * Hardlinks are often used in delicate situations. We avoid
5672 * security-related surprises by not following symlinks on the
5673 * newname. --KAB
5674 *
5675 * We don't follow them on the oldname either to be compatible
5676 * with linux 2.0, and to avoid hard-linking to directories
5677 * and other special files. --ADM
5678 */
5679int do_linkat(int olddfd, struct filename *old, int newdfd,
5680 struct filename *new, int flags)
5681{
5682 struct mnt_idmap *idmap;
5683 struct dentry *new_dentry;
5684 struct path old_path, new_path;
5685 struct delegated_inode delegated_inode = { };
5686 int how = 0;
5687 int error;
5688
5689 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
5690 error = -EINVAL;
5691 goto out_putnames;
5692 }
5693 /*
5694 * To use null names we require CAP_DAC_READ_SEARCH or
5695 * that the open-time creds of the dfd matches current.
5696 * This ensures that not everyone will be able to create
5697 * a hardlink using the passed file descriptor.
5698 */
5699 if (flags & AT_EMPTY_PATH)
5700 how |= LOOKUP_LINKAT_EMPTY;
5701
5702 if (flags & AT_SYMLINK_FOLLOW)
5703 how |= LOOKUP_FOLLOW;
5704retry:
5705 error = filename_lookup(olddfd, old, how, &old_path, NULL);
5706 if (error)
5707 goto out_putnames;
5708
5709 new_dentry = filename_create(newdfd, new, &new_path,
5710 (how & LOOKUP_REVAL));
5711 error = PTR_ERR(new_dentry);
5712 if (IS_ERR(new_dentry))
5713 goto out_putpath;
5714
5715 error = -EXDEV;
5716 if (old_path.mnt != new_path.mnt)
5717 goto out_dput;
5718 idmap = mnt_idmap(new_path.mnt);
5719 error = may_linkat(idmap, &old_path);
5720 if (unlikely(error))
5721 goto out_dput;
5722 error = security_path_link(old_path.dentry, &new_path, new_dentry);
5723 if (error)
5724 goto out_dput;
5725 error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
5726 new_dentry, &delegated_inode);
5727out_dput:
5728 end_creating_path(&new_path, new_dentry);
5729 if (is_delegated(&delegated_inode)) {
5730 error = break_deleg_wait(&delegated_inode);
5731 if (!error) {
5732 path_put(&old_path);
5733 goto retry;
5734 }
5735 }
5736 if (retry_estale(error, how)) {
5737 path_put(&old_path);
5738 how |= LOOKUP_REVAL;
5739 goto retry;
5740 }
5741out_putpath:
5742 path_put(&old_path);
5743out_putnames:
5744 putname(old);
5745 putname(new);
5746
5747 return error;
5748}
5749
/* linkat(2): create a hard link; flags may carry AT_SYMLINK_FOLLOW/AT_EMPTY_PATH */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, getname_uflags(oldname, flags),
		newdfd, getname(newname), flags);
}
5756
/* link(2): linkat() with both paths relative to the current working directory */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
5761
5762/**
5763 * vfs_rename - rename a filesystem object
5764 * @rd: pointer to &struct renamedata info
5765 *
5766 * The caller must hold multiple mutexes--see lock_rename()).
5767 *
5768 * If vfs_rename discovers a delegation in need of breaking at either
5769 * the source or destination, it will return -EWOULDBLOCK and return a
5770 * reference to the inode in delegated_inode. The caller should then
5771 * break the delegation and retry. Because breaking a delegation may
5772 * take a long time, the caller should drop all locks before doing
5773 * so.
5774 *
5775 * Alternatively, a caller may pass NULL for delegated_inode. This may
5776 * be appropriate for callers that expect the underlying filesystem not
5777 * to be NFS exported.
5778 *
5779 * The worst of all namespace operations - renaming directory. "Perverted"
5780 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
5781 * Problems:
5782 *
5783 * a) we can get into loop creation.
5784 * b) race potential - two innocent renames can create a loop together.
5785 * That's where 4.4BSD screws up. Current fix: serialization on
5786 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
5787 * story.
5788 * c) we may have to lock up to _four_ objects - parents and victim (if it exists),
5789 * and source (if it's a non-directory or a subdirectory that moves to
5790 * different parent).
5791 * And that - after we got ->i_rwsem on parents (until then we don't know
5792 * whether the target exists). Solution: try to be smart with locking
5793 * order for inodes. We rely on the fact that tree topology may change
5794 * only under ->s_vfs_rename_mutex _and_ that parent of the object we
5795 * move will be locked. Thus we can rank directories by the tree
5796 * (ancestors first) and rank all non-directories after them.
5797 * That works since everybody except rename does "lock parent, lookup,
5798 * lock child" and rename is under ->s_vfs_rename_mutex.
5799 * HOWEVER, it relies on the assumption that any object with ->lookup()
5800 * has no more than 1 dentry. If "hybrid" objects will ever appear,
5801 * we'd better make sure that there's no link(2) for them.
5802 * d) conversion from fhandle to dentry may come in the wrong moment - when
5803 * we are removing the target. Solution: we will have to grab ->i_rwsem
5804 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
5805 * ->i_rwsem on parents, which works but leads to some truly excessive
5806 * locking].
5807 */
5808int vfs_rename(struct renamedata *rd)
5809{
5810 int error;
5811 struct inode *old_dir = d_inode(rd->old_parent);
5812 struct inode *new_dir = d_inode(rd->new_parent);
5813 struct dentry *old_dentry = rd->old_dentry;
5814 struct dentry *new_dentry = rd->new_dentry;
5815 struct delegated_inode *delegated_inode = rd->delegated_inode;
5816 unsigned int flags = rd->flags;
5817 bool is_dir = d_is_dir(old_dentry);
5818 struct inode *source = old_dentry->d_inode;
5819 struct inode *target = new_dentry->d_inode;
5820 bool new_is_dir = false;
5821 unsigned max_links = new_dir->i_sb->s_max_links;
5822 struct name_snapshot old_name;
5823 bool lock_old_subdir, lock_new_subdir;
5824
5825 if (source == target)
5826 return 0;
5827
5828 error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir);
5829 if (error)
5830 return error;
5831
5832 if (!target) {
5833 error = may_create(rd->mnt_idmap, new_dir, new_dentry);
5834 } else {
5835 new_is_dir = d_is_dir(new_dentry);
5836
5837 if (!(flags & RENAME_EXCHANGE))
5838 error = may_delete(rd->mnt_idmap, new_dir,
5839 new_dentry, is_dir);
5840 else
5841 error = may_delete(rd->mnt_idmap, new_dir,
5842 new_dentry, new_is_dir);
5843 }
5844 if (error)
5845 return error;
5846
5847 if (!old_dir->i_op->rename)
5848 return -EPERM;
5849
5850 /*
5851 * If we are going to change the parent - check write permissions,
5852 * we'll need to flip '..'.
5853 */
5854 if (new_dir != old_dir) {
5855 if (is_dir) {
5856 error = inode_permission(rd->mnt_idmap, source,
5857 MAY_WRITE);
5858 if (error)
5859 return error;
5860 }
5861 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
5862 error = inode_permission(rd->mnt_idmap, target,
5863 MAY_WRITE);
5864 if (error)
5865 return error;
5866 }
5867 }
5868
5869 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
5870 flags);
5871 if (error)
5872 return error;
5873
5874 take_dentry_name_snapshot(&old_name, old_dentry);
5875 dget(new_dentry);
5876 /*
5877 * Lock children.
5878 * The source subdirectory needs to be locked on cross-directory
5879 * rename or cross-directory exchange since its parent changes.
5880 * The target subdirectory needs to be locked on cross-directory
5881 * exchange due to parent change and on any rename due to becoming
5882 * a victim.
5883 * Non-directories need locking in all cases (for NFS reasons);
5884 * they get locked after any subdirectories (in inode address order).
5885 *
5886 * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
5887 * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
5888 */
5889 lock_old_subdir = new_dir != old_dir;
5890 lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
5891 if (is_dir) {
5892 if (lock_old_subdir)
5893 inode_lock_nested(source, I_MUTEX_CHILD);
5894 if (target && (!new_is_dir || lock_new_subdir))
5895 inode_lock(target);
5896 } else if (new_is_dir) {
5897 if (lock_new_subdir)
5898 inode_lock_nested(target, I_MUTEX_CHILD);
5899 inode_lock(source);
5900 } else {
5901 lock_two_nondirectories(source, target);
5902 }
5903
5904 error = -EPERM;
5905 if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
5906 goto out;
5907
5908 error = -EBUSY;
5909 if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
5910 goto out;
5911
5912 if (max_links && new_dir != old_dir) {
5913 error = -EMLINK;
5914 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
5915 goto out;
5916 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
5917 old_dir->i_nlink >= max_links)
5918 goto out;
5919 }
5920 error = try_break_deleg(old_dir, delegated_inode);
5921 if (error)
5922 goto out;
5923 if (new_dir != old_dir) {
5924 error = try_break_deleg(new_dir, delegated_inode);
5925 if (error)
5926 goto out;
5927 }
5928 if (!is_dir) {
5929 error = try_break_deleg(source, delegated_inode);
5930 if (error)
5931 goto out;
5932 }
5933 if (target && !new_is_dir) {
5934 error = try_break_deleg(target, delegated_inode);
5935 if (error)
5936 goto out;
5937 }
5938 error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
5939 new_dir, new_dentry, flags);
5940 if (error)
5941 goto out;
5942
5943 if (!(flags & RENAME_EXCHANGE) && target) {
5944 if (is_dir) {
5945 shrink_dcache_parent(new_dentry);
5946 target->i_flags |= S_DEAD;
5947 }
5948 dont_mount(new_dentry);
5949 detach_mounts(new_dentry);
5950 }
5951 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
5952 if (!(flags & RENAME_EXCHANGE))
5953 d_move(old_dentry, new_dentry);
5954 else
5955 d_exchange(old_dentry, new_dentry);
5956 }
5957out:
5958 if (!is_dir || lock_old_subdir)
5959 inode_unlock(source);
5960 if (target && (!new_is_dir || lock_new_subdir))
5961 inode_unlock(target);
5962 dput(new_dentry);
5963 if (!error) {
5964 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
5965 !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
5966 if (flags & RENAME_EXCHANGE) {
5967 fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
5968 new_is_dir, NULL, new_dentry);
5969 }
5970 }
5971 release_dentry_name_snapshot(&old_name);
5972
5973 return error;
5974}
5975EXPORT_SYMBOL(vfs_rename);
5976
/*
 * Rename @from (relative to @olddfd) to @to (relative to @newdfd) with
 * renameat2(2) @flags.  Consumes both filenames.  Retries after a
 * delegation break and once more with LOOKUP_REVAL on -ESTALE.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
{
	struct renamedata rd;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct delegated_inode delegated_inode = { };
	unsigned int lookup_flags = 0;
	bool should_retry = false;
	int error = -EINVAL;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		goto put_names;

	/* RENAME_EXCHANGE is mutually exclusive with the other two flags */
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		goto put_names;

retry:
	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
				  &old_last, &old_type);
	if (error)
		goto put_names;

	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				  &new_type);
	if (error)
		goto exit1;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	/* source must be an ordinary component - not ".", ".." or "/" */
	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	/* same for the target; with NOREPLACE the error becomes -EEXIST */
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

retry_deleg:
	rd.old_parent	   = old_path.dentry;
	rd.mnt_idmap	   = mnt_idmap(old_path.mnt);
	rd.new_parent	   = new_path.dentry;
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;

	/* locks the parents and fills rd.old_dentry / rd.new_dentry */
	error = __start_renaming(&rd, lookup_flags, &old_last, &new_last);
	if (error)
		goto exit_lock_rename;

	if (flags & RENAME_EXCHANGE) {
		if (!d_is_dir(rd.new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit_unlock;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(rd.old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit_unlock;
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit_unlock;
	}

	error = security_path_rename(&old_path, rd.old_dentry,
				     &new_path, rd.new_dentry, flags);
	if (error)
		goto exit_unlock;

	error = vfs_rename(&rd);
exit_unlock:
	end_renaming(&rd);
exit_lock_rename:
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
exit1:
	path_put(&old_path);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
put_names:
	putname(from);
	putname(to);
	return error;
}
6082
/* renameat2(2): rename with RENAME_NOREPLACE/RENAME_EXCHANGE/RENAME_WHITEOUT */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				flags);
}
6089
/* renameat(2): renameat2() with no flags */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				0);
}
6096
/* rename(2): both paths relative to the current working directory, no flags */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
				getname(newname), 0);
}
6102
6103int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
6104{
6105 int copylen;
6106
6107 copylen = linklen;
6108 if (unlikely(copylen > (unsigned) buflen))
6109 copylen = buflen;
6110 if (copy_to_user(buffer, link, copylen))
6111 copylen = -EFAULT;
6112 return copylen;
6113}
6114
6115/**
6116 * vfs_readlink - copy symlink body into userspace buffer
6117 * @dentry: dentry on which to get symbolic link
6118 * @buffer: user memory pointer
6119 * @buflen: size of buffer
6120 *
6121 * Does not touch atime. That's up to the caller if necessary
6122 *
6123 * Does not call security hook.
6124 */
6125int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
6126{
6127 struct inode *inode = d_inode(dentry);
6128 DEFINE_DELAYED_CALL(done);
6129 const char *link;
6130 int res;
6131
6132 if (inode->i_opflags & IOP_CACHED_LINK)
6133 return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
6134
6135 if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
6136 if (unlikely(inode->i_op->readlink))
6137 return inode->i_op->readlink(dentry, buffer, buflen);
6138
6139 if (!d_is_symlink(dentry))
6140 return -EINVAL;
6141
6142 spin_lock(&inode->i_lock);
6143 inode->i_opflags |= IOP_DEFAULT_READLINK;
6144 spin_unlock(&inode->i_lock);
6145 }
6146
6147 link = READ_ONCE(inode->i_link);
6148 if (!link) {
6149 link = inode->i_op->get_link(dentry, inode, &done);
6150 if (IS_ERR(link))
6151 return PTR_ERR(link);
6152 }
6153 res = readlink_copy(buffer, buflen, link, strlen(link));
6154 do_delayed_call(&done);
6155 return res;
6156}
6157EXPORT_SYMBOL(vfs_readlink);
6158
6159/**
6160 * vfs_get_link - get symlink body
6161 * @dentry: dentry on which to get symbolic link
6162 * @done: caller needs to free returned data with this
6163 *
6164 * Calls security hook and i_op->get_link() on the supplied inode.
6165 *
6166 * It does not touch atime. That's up to the caller if necessary.
6167 *
6168 * Does not work on "special" symlinks like /proc/$$/fd/N
6169 */
6170const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
6171{
6172 const char *res = ERR_PTR(-EINVAL);
6173 struct inode *inode = d_inode(dentry);
6174
6175 if (d_is_symlink(dentry)) {
6176 res = ERR_PTR(security_inode_readlink(dentry));
6177 if (!res)
6178 res = inode->i_op->get_link(dentry, inode, done);
6179 }
6180 return res;
6181}
6182EXPORT_SYMBOL(vfs_get_link);
6183
/* get the link contents into pagecache */
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
			     struct delayed_call *callback)
{
	struct folio *folio;
	struct address_space *mapping = inode->i_mapping;

	if (!dentry) {
		/*
		 * No dentry: we may not block (presumably the RCU-walk
		 * case - TODO confirm).  Only use an already-cached,
		 * uptodate folio; otherwise -ECHILD makes the caller retry.
		 */
		folio = filemap_get_folio(mapping, 0);
		if (IS_ERR(folio))
			return ERR_PTR(-ECHILD);
		if (!folio_test_uptodate(folio)) {
			folio_put(folio);
			return ERR_PTR(-ECHILD);
		}
	} else {
		/* blocking read of folio 0, which holds the link body */
		folio = read_mapping_folio(mapping, 0, NULL);
		if (IS_ERR(folio))
			return ERR_CAST(folio);
	}
	/* the folio reference is dropped later via page_put_link() */
	set_delayed_call(callback, page_put_link, folio);
	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
	return folio_address(folio);
}
6208
/*
 * Like page_get_link(), but without forcing NUL-termination of the
 * link body (no nd_terminate_link() call).
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
			      struct delayed_call *callback)
{
	return __page_get_link(dentry, inode, callback);
}
6214EXPORT_SYMBOL_GPL(page_get_link_raw);
6215
6216/**
6217 * page_get_link() - An implementation of the get_link inode_operation.
6218 * @dentry: The directory entry which is the symlink.
6219 * @inode: The inode for the symlink.
6220 * @callback: Used to drop the reference to the symlink.
6221 *
6222 * Filesystems which store their symlinks in the page cache should use
6223 * this to implement the get_link() member of their inode_operations.
6224 *
6225 * Return: A pointer to the NUL-terminated symlink.
6226 */
6227const char *page_get_link(struct dentry *dentry, struct inode *inode,
6228 struct delayed_call *callback)
6229{
6230 char *kaddr = __page_get_link(dentry, inode, callback);
6231
6232 if (!IS_ERR(kaddr))
6233 nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
6234 return kaddr;
6235}
6236EXPORT_SYMBOL(page_get_link);
6237
6238/**
6239 * page_put_link() - Drop the reference to the symlink.
6240 * @arg: The folio which contains the symlink.
6241 *
6242 * This is used internally by page_get_link(). It is exported for use
6243 * by filesystems which need to implement a variant of page_get_link()
6244 * themselves. Despite the apparent symmetry, filesystems which use
6245 * page_get_link() do not need to call page_put_link().
6246 *
6247 * The argument, while it has a void pointer type, must be a pointer to
6248 * the folio which was retrieved from the page cache. The delayed_call
6249 * infrastructure is used to drop the reference count once the caller
6250 * is done with the symlink.
6251 */
6252void page_put_link(void *arg)
6253{
6254 folio_put(arg);
6255}
6256EXPORT_SYMBOL(page_put_link);
6257
6258int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
6259{
6260 const char *link;
6261 int res;
6262
6263 DEFINE_DELAYED_CALL(done);
6264 link = page_get_link(dentry, d_inode(dentry), &done);
6265 res = PTR_ERR(link);
6266 if (!IS_ERR(link))
6267 res = readlink_copy(buffer, buflen, link, strlen(link));
6268 do_delayed_call(&done);
6269 return res;
6270}
6271EXPORT_SYMBOL(page_readlink);
6272
/*
 * page_symlink - write a symlink body into the inode's pagecache
 * @inode: inode of the symlink
 * @symname: the link target string
 * @len: length of @symname including the trailing NUL, presumably
 *       strlen(symname) + 1 - only len - 1 bytes are stored
 *
 * Uses the mapping's ->write_begin()/->write_end() pair on page 0,
 * retrying from scratch on a short write.  Returns 0 on success or a
 * negative errno from the aops methods.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
	struct address_space *mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	/* If the mapping forbids __GFP_FS allocations, enter nofs scope
	 * around write_begin() so its allocations cannot recurse into
	 * the filesystem. */
	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
	struct folio *folio;
	void *fsdata = NULL;
	int err;
	unsigned int flags;

retry:
	if (nofs)
		flags = memalloc_nofs_save();
	err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
	if (nofs)
		memalloc_nofs_restore(flags);
	if (err)
		goto fail;

	/* Copy the link body (without the trailing NUL). */
	memcpy(folio_address(folio), symname, len - 1);

	err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
						folio, fsdata);
	if (err < 0)
		goto fail;
	/* Short write: start over rather than continue mid-page. */
	if (err < len-1)
		goto retry;

	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
6307
/*
 * Ready-made inode_operations for filesystems that keep their symlink
 * bodies in the page cache.
 */
const struct inode_operations page_symlink_inode_operations = {
	.get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);