fs/open.c at v6.11-rc1 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / open.c
at v6.11-rc1 1654 lines 41 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/fs/open.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/string.h>
   9#include <linux/mm.h>
  10#include <linux/file.h>
  11#include <linux/fdtable.h>
  12#include <linux/fsnotify.h>
  13#include <linux/module.h>
  14#include <linux/tty.h>
  15#include <linux/namei.h>
  16#include <linux/backing-dev.h>
  17#include <linux/capability.h>
  18#include <linux/securebits.h>
  19#include <linux/security.h>
  20#include <linux/mount.h>
  21#include <linux/fcntl.h>
  22#include <linux/slab.h>
  23#include <linux/uaccess.h>
  24#include <linux/fs.h>
  25#include <linux/personality.h>
  26#include <linux/pagemap.h>
  27#include <linux/syscalls.h>
  28#include <linux/rcupdate.h>
  29#include <linux/audit.h>
  30#include <linux/falloc.h>
  31#include <linux/fs_struct.h>
  32#include <linux/dnotify.h>
  33#include <linux/compat.h>
  34#include <linux/mnt_idmapping.h>
  35#include <linux/filelock.h>
  36
  37#include "internal.h"
  38
  39int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
  40		loff_t length, unsigned int time_attrs, struct file *filp)
  41{
  42	int ret;
  43	struct iattr newattrs;
  44
  45	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
  46	if (length < 0)
  47		return -EINVAL;
  48
  49	newattrs.ia_size = length;
  50	newattrs.ia_valid = ATTR_SIZE | time_attrs;
  51	if (filp) {
  52		newattrs.ia_file = filp;
  53		newattrs.ia_valid |= ATTR_FILE;
  54	}
  55
  56	/* Remove suid, sgid, and file capabilities on truncate too */
  57	ret = dentry_needs_remove_privs(idmap, dentry);
  58	if (ret < 0)
  59		return ret;
  60	if (ret)
  61		newattrs.ia_valid |= ret | ATTR_FORCE;
  62
  63	inode_lock(dentry->d_inode);
  64	/* Note any delegations or leases have already been broken: */
  65	ret = notify_change(idmap, dentry, &newattrs, NULL);
  66	inode_unlock(dentry->d_inode);
  67	return ret;
  68}
  69
  70long vfs_truncate(const struct path *path, loff_t length)
  71{
  72	struct mnt_idmap *idmap;
  73	struct inode *inode;
  74	long error;
  75
  76	inode = path->dentry->d_inode;
  77
  78	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
  79	if (S_ISDIR(inode->i_mode))
  80		return -EISDIR;
  81	if (!S_ISREG(inode->i_mode))
  82		return -EINVAL;
  83
  84	error = mnt_want_write(path->mnt);
  85	if (error)
  86		goto out;
  87
  88	idmap = mnt_idmap(path->mnt);
  89	error = inode_permission(idmap, inode, MAY_WRITE);
  90	if (error)
  91		goto mnt_drop_write_and_out;
  92
  93	error = -EPERM;
  94	if (IS_APPEND(inode))
  95		goto mnt_drop_write_and_out;
  96
  97	error = get_write_access(inode);
  98	if (error)
  99		goto mnt_drop_write_and_out;
 100
 101	/*
 102	 * Make sure that there are no leases.  get_write_access() protects
 103	 * against the truncate racing with a lease-granting setlease().
 104	 */
 105	error = break_lease(inode, O_WRONLY);
 106	if (error)
 107		goto put_write_and_out;
 108
 109	error = security_path_truncate(path);
 110	if (!error)
 111		error = do_truncate(idmap, path->dentry, length, 0, NULL);
 112
 113put_write_and_out:
 114	put_write_access(inode);
 115mnt_drop_write_and_out:
 116	mnt_drop_write(path->mnt);
 117out:
 118	return error;
 119}
 120EXPORT_SYMBOL_GPL(vfs_truncate);
 121
 122long do_sys_truncate(const char __user *pathname, loff_t length)
 123{
 124	unsigned int lookup_flags = LOOKUP_FOLLOW;
 125	struct path path;
 126	int error;
 127
 128	if (length < 0)	/* sorry, but loff_t says... */
 129		return -EINVAL;
 130
 131retry:
 132	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 133	if (!error) {
 134		error = vfs_truncate(&path, length);
 135		path_put(&path);
 136	}
 137	if (retry_estale(error, lookup_flags)) {
 138		lookup_flags |= LOOKUP_REVAL;
 139		goto retry;
 140	}
 141	return error;
 142}
 143
/* truncate(2) entry point: forwards @path and @length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
	return do_sys_truncate(path, length);
}
 148
#ifdef CONFIG_COMPAT
/*
 * Compat truncate(2) entry point: identical to the native call except that
 * @length arrives as a compat_off_t before being passed to do_sys_truncate().
 */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
	return do_sys_truncate(path, length);
}
#endif
 155
 156long do_ftruncate(struct file *file, loff_t length, int small)
 157{
 158	struct inode *inode;
 159	struct dentry *dentry;
 160	int error;
 161
 162	/* explicitly opened as large or we are on 64-bit box */
 163	if (file->f_flags & O_LARGEFILE)
 164		small = 0;
 165
 166	dentry = file->f_path.dentry;
 167	inode = dentry->d_inode;
 168	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
 169		return -EINVAL;
 170
 171	/* Cannot ftruncate over 2^31 bytes without large file support */
 172	if (small && length > MAX_NON_LFS)
 173		return -EINVAL;
 174
 175	/* Check IS_APPEND on real upper inode */
 176	if (IS_APPEND(file_inode(file)))
 177		return -EPERM;
 178	sb_start_write(inode->i_sb);
 179	error = security_file_truncate(file);
 180	if (!error)
 181		error = do_truncate(file_mnt_idmap(file), dentry, length,
 182				    ATTR_MTIME | ATTR_CTIME, file);
 183	sb_end_write(inode->i_sb);
 184
 185	return error;
 186}
 187
 188long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 189{
 190	struct fd f;
 191	int error;
 192
 193	if (length < 0)
 194		return -EINVAL;
 195	f = fdget(fd);
 196	if (!f.file)
 197		return -EBADF;
 198
 199	error = do_ftruncate(f.file, length, small);
 200
 201	fdput(f);
 202	return error;
 203}
 204
/*
 * ftruncate(2) entry point.  Passes small=1, so do_ftruncate() applies the
 * MAX_NON_LFS limit unless the file was opened with O_LARGEFILE.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
	return do_sys_ftruncate(fd, length, 1);
}
 209
#ifdef CONFIG_COMPAT
/*
 * Compat ftruncate(2) entry point: takes a compat_off_t length and, like the
 * native call, passes small=1 to do_sys_ftruncate().
 */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
	return do_sys_ftruncate(fd, length, 1);
}
#endif
 216
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
/* truncate64(2): same as truncate(2) but with a full loff_t length. */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
	return do_sys_truncate(path, length);
}

/*
 * ftruncate64(2): passes small=0, so do_ftruncate() never applies the
 * MAX_NON_LFS limit for this entry point.
 */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
	return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */
 229
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
/*
 * Compat truncate64(2) entry point.  The length is declared through
 * compat_arg_u64_dual() and handed on via compat_arg_u64_glue() --
 * presumably two 32-bit halves recombined into one 64-bit value; confirm
 * against the macro definitions.  ksys_truncate() is defined outside this
 * file.
 */
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
		       compat_arg_u64_dual(length))
{
	return ksys_truncate(pathname, compat_arg_u64_glue(length));
}
#endif
 237
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
/*
 * Compat ftruncate64(2) entry point.  The length is declared through
 * compat_arg_u64_dual() and handed on via compat_arg_u64_glue() --
 * presumably two 32-bit halves recombined into one 64-bit value; confirm
 * against the macro definitions.  ksys_ftruncate() is defined outside this
 * file.
 */
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
		       compat_arg_u64_dual(length))
{
	return ksys_ftruncate(fd, compat_arg_u64_glue(length));
}
#endif
 245
 246int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 247{
 248	struct inode *inode = file_inode(file);
 249	long ret;
 250	loff_t sum;
 251
 252	if (offset < 0 || len <= 0)
 253		return -EINVAL;
 254
 255	/* Return error if mode is not supported */
 256	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
 257		return -EOPNOTSUPP;
 258
 259	/* Punch hole and zero range are mutually exclusive */
 260	if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
 261	    (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
 262		return -EOPNOTSUPP;
 263
 264	/* Punch hole must have keep size set */
 265	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
 266	    !(mode & FALLOC_FL_KEEP_SIZE))
 267		return -EOPNOTSUPP;
 268
 269	/* Collapse range should only be used exclusively. */
 270	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
 271	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
 272		return -EINVAL;
 273
 274	/* Insert range should only be used exclusively. */
 275	if ((mode & FALLOC_FL_INSERT_RANGE) &&
 276	    (mode & ~FALLOC_FL_INSERT_RANGE))
 277		return -EINVAL;
 278
 279	/* Unshare range should only be used with allocate mode. */
 280	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
 281	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
 282		return -EINVAL;
 283
 284	if (!(file->f_mode & FMODE_WRITE))
 285		return -EBADF;
 286
 287	/*
 288	 * We can only allow pure fallocate on append only files
 289	 */
 290	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
 291		return -EPERM;
 292
 293	if (IS_IMMUTABLE(inode))
 294		return -EPERM;
 295
 296	/*
 297	 * We cannot allow any fallocate operation on an active swapfile
 298	 */
 299	if (IS_SWAPFILE(inode))
 300		return -ETXTBSY;
 301
 302	/*
 303	 * Revalidate the write permissions, in case security policy has
 304	 * changed since the files were opened.
 305	 */
 306	ret = security_file_permission(file, MAY_WRITE);
 307	if (ret)
 308		return ret;
 309
 310	ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
 311	if (ret)
 312		return ret;
 313
 314	if (S_ISFIFO(inode->i_mode))
 315		return -ESPIPE;
 316
 317	if (S_ISDIR(inode->i_mode))
 318		return -EISDIR;
 319
 320	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 321		return -ENODEV;
 322
 323	/* Check for wraparound */
 324	if (check_add_overflow(offset, len, &sum))
 325		return -EFBIG;
 326
 327	if (sum > inode->i_sb->s_maxbytes)
 328		return -EFBIG;
 329
 330	if (!file->f_op->fallocate)
 331		return -EOPNOTSUPP;
 332
 333	file_start_write(file);
 334	ret = file->f_op->fallocate(file, mode, offset, len);
 335
 336	/*
 337	 * Create inotify and fanotify events.
 338	 *
 339	 * To keep the logic simple always create events if fallocate succeeds.
 340	 * This implies that events are even created if the file size remains
 341	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
 342	 */
 343	if (ret == 0)
 344		fsnotify_modify(file);
 345
 346	file_end_write(file);
 347	return ret;
 348}
 349EXPORT_SYMBOL_GPL(vfs_fallocate);
 350
 351int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 352{
 353	struct fd f = fdget(fd);
 354	int error = -EBADF;
 355
 356	if (f.file) {
 357		error = vfs_fallocate(f.file, mode, offset, len);
 358		fdput(f);
 359	}
 360	return error;
 361}
 362
/* fallocate(2) entry point: forwards all arguments to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
	return ksys_fallocate(fd, mode, offset, len);
}
 367
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
/*
 * Compat fallocate(2) entry point.  @offset and @len are each declared
 * through compat_arg_u64_dual() and handed on via compat_arg_u64_glue() --
 * presumably pairs of 32-bit halves recombined into 64-bit values; confirm
 * against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(len))
{
	return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
			      compat_arg_u64_glue(len));
}
#endif
 376
 377/*
 378 * access() needs to use the real uid/gid, not the effective uid/gid.
 379 * We do this by temporarily clearing all FS-related capabilities and
 380 * switching the fsuid/fsgid around to the real ones.
 381 *
 382 * Creating new credentials is expensive, so we try to skip doing it,
 383 * which we can if the result would match what we already got.
 384 */
 385static bool access_need_override_creds(int flags)
 386{
 387	const struct cred *cred;
 388
 389	if (flags & AT_EACCESS)
 390		return false;
 391
 392	cred = current_cred();
 393	if (!uid_eq(cred->fsuid, cred->uid) ||
 394	    !gid_eq(cred->fsgid, cred->gid))
 395		return true;
 396
 397	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 398		kuid_t root_uid = make_kuid(cred->user_ns, 0);
 399		if (!uid_eq(cred->uid, root_uid)) {
 400			if (!cap_isclear(cred->cap_effective))
 401				return true;
 402		} else {
 403			if (!cap_isidentical(cred->cap_effective,
 404			    cred->cap_permitted))
 405				return true;
 406		}
 407	}
 408
 409	return false;
 410}
 411
 412static const struct cred *access_override_creds(void)
 413{
 414	const struct cred *old_cred;
 415	struct cred *override_cred;
 416
 417	override_cred = prepare_creds();
 418	if (!override_cred)
 419		return NULL;
 420
 421	/*
 422	 * XXX access_need_override_creds performs checks in hopes of skipping
 423	 * this work. Make sure it stays in sync if making any changes in this
 424	 * routine.
 425	 */
 426
 427	override_cred->fsuid = override_cred->uid;
 428	override_cred->fsgid = override_cred->gid;
 429
 430	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 431		/* Clear the capabilities if we switch to a non-root user */
 432		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
 433		if (!uid_eq(override_cred->uid, root_uid))
 434			cap_clear(override_cred->cap_effective);
 435		else
 436			override_cred->cap_effective =
 437				override_cred->cap_permitted;
 438	}
 439
 440	/*
 441	 * The new set of credentials can *only* be used in
 442	 * task-synchronous circumstances, and does not need
 443	 * RCU freeing, unless somebody then takes a separate
 444	 * reference to it.
 445	 *
 446	 * NOTE! This is _only_ true because this credential
 447	 * is used purely for override_creds() that installs
 448	 * it as the subjective cred. Other threads will be
 449	 * accessing ->real_cred, not the subjective cred.
 450	 *
 451	 * If somebody _does_ make a copy of this (using the
 452	 * 'get_current_cred()' function), that will clear the
 453	 * non_rcu field, because now that other user may be
 454	 * expecting RCU freeing. But normal thread-synchronous
 455	 * cred accesses will keep things non-racy to avoid RCU
 456	 * freeing.
 457	 */
 458	override_cred->non_rcu = 1;
 459
 460	old_cred = override_creds(override_cred);
 461
 462	/* override_cred() gets its own ref */
 463	put_cred(override_cred);
 464
 465	return old_cred;
 466}
 467
 468static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
 469{
 470	struct path path;
 471	struct inode *inode;
 472	int res;
 473	unsigned int lookup_flags = LOOKUP_FOLLOW;
 474	const struct cred *old_cred = NULL;
 475
 476	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 477		return -EINVAL;
 478
 479	if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
 480		return -EINVAL;
 481
 482	if (flags & AT_SYMLINK_NOFOLLOW)
 483		lookup_flags &= ~LOOKUP_FOLLOW;
 484	if (flags & AT_EMPTY_PATH)
 485		lookup_flags |= LOOKUP_EMPTY;
 486
 487	if (access_need_override_creds(flags)) {
 488		old_cred = access_override_creds();
 489		if (!old_cred)
 490			return -ENOMEM;
 491	}
 492
 493retry:
 494	res = user_path_at(dfd, filename, lookup_flags, &path);
 495	if (res)
 496		goto out;
 497
 498	inode = d_backing_inode(path.dentry);
 499
 500	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 501		/*
 502		 * MAY_EXEC on regular files is denied if the fs is mounted
 503		 * with the "noexec" flag.
 504		 */
 505		res = -EACCES;
 506		if (path_noexec(&path))
 507			goto out_path_release;
 508	}
 509
 510	res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);
 511	/* SuS v2 requires we report a read only fs too */
 512	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 513		goto out_path_release;
 514	/*
 515	 * This is a rare case where using __mnt_is_readonly()
 516	 * is OK without a mnt_want/drop_write() pair.  Since
 517	 * no actual write to the fs is performed here, we do
 518	 * not need to telegraph to that to anyone.
 519	 *
 520	 * By doing this, we accept that this access is
 521	 * inherently racy and know that the fs may change
 522	 * state before we even see this result.
 523	 */
 524	if (__mnt_is_readonly(path.mnt))
 525		res = -EROFS;
 526
 527out_path_release:
 528	path_put(&path);
 529	if (retry_estale(res, lookup_flags)) {
 530		lookup_flags |= LOOKUP_REVAL;
 531		goto retry;
 532	}
 533out:
 534	if (old_cred)
 535		revert_creds(old_cred);
 536
 537	return res;
 538}
 539
/* faccessat(2) entry point: do_faccessat() with no flags. */
SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
	return do_faccessat(dfd, filename, mode, 0);
}
 544
/*
 * faccessat2(2) entry point: like faccessat() but passes the caller's
 * @flags through to do_faccessat(), which validates them.
 */
SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
		int, flags)
{
	return do_faccessat(dfd, filename, mode, flags);
}
 550
/* access(2) entry point: do_faccessat() relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
	return do_faccessat(AT_FDCWD, filename, mode, 0);
}
 555
 556SYSCALL_DEFINE1(chdir, const char __user *, filename)
 557{
 558	struct path path;
 559	int error;
 560	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 561retry:
 562	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 563	if (error)
 564		goto out;
 565
 566	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 567	if (error)
 568		goto dput_and_out;
 569
 570	set_fs_pwd(current->fs, &path);
 571
 572dput_and_out:
 573	path_put(&path);
 574	if (retry_estale(error, lookup_flags)) {
 575		lookup_flags |= LOOKUP_REVAL;
 576		goto retry;
 577	}
 578out:
 579	return error;
 580}
 581
 582SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 583{
 584	struct fd f = fdget_raw(fd);
 585	int error;
 586
 587	error = -EBADF;
 588	if (!f.file)
 589		goto out;
 590
 591	error = -ENOTDIR;
 592	if (!d_can_lookup(f.file->f_path.dentry))
 593		goto out_putf;
 594
 595	error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
 596	if (!error)
 597		set_fs_pwd(current->fs, &f.file->f_path);
 598out_putf:
 599	fdput(f);
 600out:
 601	return error;
 602}
 603
 604SYSCALL_DEFINE1(chroot, const char __user *, filename)
 605{
 606	struct path path;
 607	int error;
 608	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 609retry:
 610	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 611	if (error)
 612		goto out;
 613
 614	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 615	if (error)
 616		goto dput_and_out;
 617
 618	error = -EPERM;
 619	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
 620		goto dput_and_out;
 621	error = security_path_chroot(&path);
 622	if (error)
 623		goto dput_and_out;
 624
 625	set_fs_root(current->fs, &path);
 626	error = 0;
 627dput_and_out:
 628	path_put(&path);
 629	if (retry_estale(error, lookup_flags)) {
 630		lookup_flags |= LOOKUP_REVAL;
 631		goto retry;
 632	}
 633out:
 634	return error;
 635}
 636
 637int chmod_common(const struct path *path, umode_t mode)
 638{
 639	struct inode *inode = path->dentry->d_inode;
 640	struct inode *delegated_inode = NULL;
 641	struct iattr newattrs;
 642	int error;
 643
 644	error = mnt_want_write(path->mnt);
 645	if (error)
 646		return error;
 647retry_deleg:
 648	inode_lock(inode);
 649	error = security_path_chmod(path, mode);
 650	if (error)
 651		goto out_unlock;
 652	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 653	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 654	error = notify_change(mnt_idmap(path->mnt), path->dentry,
 655			      &newattrs, &delegated_inode);
 656out_unlock:
 657	inode_unlock(inode);
 658	if (delegated_inode) {
 659		error = break_deleg_wait(&delegated_inode);
 660		if (!error)
 661			goto retry_deleg;
 662	}
 663	mnt_drop_write(path->mnt);
 664	return error;
 665}
 666
/*
 * vfs_fchmod - change the mode of an open file
 *
 * Records @file with audit_file() and then applies @mode to the file's path
 * through chmod_common(), whose result is returned.
 */
int vfs_fchmod(struct file *file, umode_t mode)
{
	audit_file(file);
	return chmod_common(&file->f_path, mode);
}
 672
 673SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 674{
 675	struct fd f = fdget(fd);
 676	int err = -EBADF;
 677
 678	if (f.file) {
 679		err = vfs_fchmod(f.file, mode);
 680		fdput(f);
 681	}
 682	return err;
 683}
 684
 685static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
 686		       unsigned int flags)
 687{
 688	struct path path;
 689	int error;
 690	unsigned int lookup_flags;
 691
 692	if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
 693		return -EINVAL;
 694
 695	lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 696	if (flags & AT_EMPTY_PATH)
 697		lookup_flags |= LOOKUP_EMPTY;
 698
 699retry:
 700	error = user_path_at(dfd, filename, lookup_flags, &path);
 701	if (!error) {
 702		error = chmod_common(&path, mode);
 703		path_put(&path);
 704		if (retry_estale(error, lookup_flags)) {
 705			lookup_flags |= LOOKUP_REVAL;
 706			goto retry;
 707		}
 708	}
 709	return error;
 710}
 711
/*
 * fchmodat2(2) entry point: passes the caller's @flags through to
 * do_fchmodat(), which validates them.
 */
SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
		umode_t, mode, unsigned int, flags)
{
	return do_fchmodat(dfd, filename, mode, flags);
}
 717
/* fchmodat(2) entry point: do_fchmodat() with no flags. */
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
		umode_t, mode)
{
	return do_fchmodat(dfd, filename, mode, 0);
}
 723
/* chmod(2) entry point: do_fchmodat() relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
	return do_fchmodat(AT_FDCWD, filename, mode, 0);
}
 728
 729/*
 730 * Check whether @kuid is valid and if so generate and set vfsuid_t in
 731 * ia_vfsuid.
 732 *
 733 * Return: true if @kuid is valid, false if not.
 734 */
 735static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
 736{
 737	if (!uid_valid(kuid))
 738		return false;
 739	attr->ia_valid |= ATTR_UID;
 740	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
 741	return true;
 742}
 743
 744/*
 745 * Check whether @kgid is valid and if so generate and set vfsgid_t in
 746 * ia_vfsgid.
 747 *
 748 * Return: true if @kgid is valid, false if not.
 749 */
 750static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
 751{
 752	if (!gid_valid(kgid))
 753		return false;
 754	attr->ia_valid |= ATTR_GID;
 755	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
 756	return true;
 757}
 758
 759int chown_common(const struct path *path, uid_t user, gid_t group)
 760{
 761	struct mnt_idmap *idmap;
 762	struct user_namespace *fs_userns;
 763	struct inode *inode = path->dentry->d_inode;
 764	struct inode *delegated_inode = NULL;
 765	int error;
 766	struct iattr newattrs;
 767	kuid_t uid;
 768	kgid_t gid;
 769
 770	uid = make_kuid(current_user_ns(), user);
 771	gid = make_kgid(current_user_ns(), group);
 772
 773	idmap = mnt_idmap(path->mnt);
 774	fs_userns = i_user_ns(inode);
 775
 776retry_deleg:
 777	newattrs.ia_vfsuid = INVALID_VFSUID;
 778	newattrs.ia_vfsgid = INVALID_VFSGID;
 779	newattrs.ia_valid =  ATTR_CTIME;
 780	if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
 781		return -EINVAL;
 782	if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
 783		return -EINVAL;
 784	inode_lock(inode);
 785	if (!S_ISDIR(inode->i_mode))
 786		newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
 787				     setattr_should_drop_sgid(idmap, inode);
 788	/* Continue to send actual fs values, not the mount values. */
 789	error = security_path_chown(
 790		path,
 791		from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
 792		from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
 793	if (!error)
 794		error = notify_change(idmap, path->dentry, &newattrs,
 795				      &delegated_inode);
 796	inode_unlock(inode);
 797	if (delegated_inode) {
 798		error = break_deleg_wait(&delegated_inode);
 799		if (!error)
 800			goto retry_deleg;
 801	}
 802	return error;
 803}
 804
 805int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
 806		int flag)
 807{
 808	struct path path;
 809	int error = -EINVAL;
 810	int lookup_flags;
 811
 812	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 813		goto out;
 814
 815	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 816	if (flag & AT_EMPTY_PATH)
 817		lookup_flags |= LOOKUP_EMPTY;
 818retry:
 819	error = user_path_at(dfd, filename, lookup_flags, &path);
 820	if (error)
 821		goto out;
 822	error = mnt_want_write(path.mnt);
 823	if (error)
 824		goto out_release;
 825	error = chown_common(&path, user, group);
 826	mnt_drop_write(path.mnt);
 827out_release:
 828	path_put(&path);
 829	if (retry_estale(error, lookup_flags)) {
 830		lookup_flags |= LOOKUP_REVAL;
 831		goto retry;
 832	}
 833out:
 834	return error;
 835}
 836
/* fchownat(2) entry point: forwards all arguments to do_fchownat(). */
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
		gid_t, group, int, flag)
{
	return do_fchownat(dfd, filename, user, group, flag);
}
 842
/* chown(2) entry point: do_fchownat() relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
	return do_fchownat(AT_FDCWD, filename, user, group, 0);
}
 847
/*
 * lchown(2) entry point: like chown() but passes AT_SYMLINK_NOFOLLOW, so
 * do_fchownat() performs the lookup without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
	return do_fchownat(AT_FDCWD, filename, user, group,
			   AT_SYMLINK_NOFOLLOW);
}
 853
 854int vfs_fchown(struct file *file, uid_t user, gid_t group)
 855{
 856	int error;
 857
 858	error = mnt_want_write_file(file);
 859	if (error)
 860		return error;
 861	audit_file(file);
 862	error = chown_common(&file->f_path, user, group);
 863	mnt_drop_write_file(file);
 864	return error;
 865}
 866
 867int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
 868{
 869	struct fd f = fdget(fd);
 870	int error = -EBADF;
 871
 872	if (f.file) {
 873		error = vfs_fchown(f.file, user, group);
 874		fdput(f);
 875	}
 876	return error;
 877}
 878
/* fchown(2) entry point: forwards all arguments to ksys_fchown(). */
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
	return ksys_fchown(fd, user, group);
}
 883
 884static inline int file_get_write_access(struct file *f)
 885{
 886	int error;
 887
 888	error = get_write_access(f->f_inode);
 889	if (unlikely(error))
 890		return error;
 891	error = mnt_get_write_access(f->f_path.mnt);
 892	if (unlikely(error))
 893		goto cleanup_inode;
 894	if (unlikely(f->f_mode & FMODE_BACKING)) {
 895		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
 896		if (unlikely(error))
 897			goto cleanup_mnt;
 898	}
 899	return 0;
 900
 901cleanup_mnt:
 902	mnt_put_write_access(f->f_path.mnt);
 903cleanup_inode:
 904	put_write_access(f->f_inode);
 905	return error;
 906}
 907
/*
 * do_dentry_open - finish setting up @f for the dentry in f->f_path
 * @f: file whose f_path, f_flags and f_mode have been filled in
 * @open: open callback to use, or NULL to use f->f_op->open
 *
 * Takes a reference on f->f_path, wires up inode/mapping/error-sampling
 * fields, accounts read or write access, installs the inode's file
 * operations, runs the security and lease checks, calls the open callback
 * and finally derives the FMODE_CAN_* capability bits.
 *
 * Returns 0 on success.  On the cleanup_all/cleanup_file error paths the
 * path reference is dropped again and f_path/f_inode are reset to NULL.
 */
static int do_dentry_open(struct file *f,
			  int (*open)(struct inode *, struct file *))
{
	static const struct file_operations empty_fops = {};
	struct inode *inode = f->f_path.dentry->d_inode;
	int error;

	path_get(&f->f_path);
	f->f_inode = inode;
	f->f_mapping = inode->i_mapping;
	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
	f->f_sb_err = file_sample_sb_err(f);

	/* O_PATH files get empty file operations and skip everything below. */
	if (unlikely(f->f_flags & O_PATH)) {
		f->f_mode = FMODE_PATH | FMODE_OPENED;
		f->f_op = &empty_fops;
		return 0;
	}

	/*
	 * Read-only opens bump the inode's read count; opens for writing on
	 * anything but a special file take write access instead.
	 */
	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
		i_readcount_inc(inode);
	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
		error = file_get_write_access(f);
		if (unlikely(error))
			goto cleanup_file;
		f->f_mode |= FMODE_WRITER;
	}

	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
		f->f_mode |= FMODE_ATOMIC_POS;

	f->f_op = fops_get(inode->i_fop);
	if (WARN_ON(!f->f_op)) {
		error = -ENODEV;
		goto cleanup_all;
	}

	error = security_file_open(f);
	if (error)
		goto cleanup_all;

	error = break_lease(file_inode(f), f->f_flags);
	if (error)
		goto cleanup_all;

	/* normally all 3 are set; ->open() can clear them if needed */
	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	if (!open)
		open = f->f_op->open;
	if (open) {
		error = open(inode, f);
		if (error)
			goto cleanup_all;
	}
	f->f_mode |= FMODE_OPENED;
	/* Advertise only the operations the file_operations can back. */
	if ((f->f_mode & FMODE_READ) &&
	     likely(f->f_op->read || f->f_op->read_iter))
		f->f_mode |= FMODE_CAN_READ;
	if ((f->f_mode & FMODE_WRITE) &&
	     likely(f->f_op->write || f->f_op->write_iter))
		f->f_mode |= FMODE_CAN_WRITE;
	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
		f->f_mode &= ~FMODE_LSEEK;
	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
		f->f_mode |= FMODE_CAN_ODIRECT;

	/* These flags only matter while opening; drop them from the file. */
	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
	f->f_iocb_flags = iocb_flags(f);

	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

	/*
	 * NOTE(review): this return bypasses the cleanup labels below.
	 * FMODE_OPENED is already set here, so presumably the caller's
	 * fput() performs the teardown -- confirm against the callers.
	 */
	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;

	/*
	 * XXX: Huge page cache doesn't support writing yet. Drop all page
	 * cache for this file before processing writes.
	 */
	if (f->f_mode & FMODE_WRITE) {
		/*
		 * Depends on full fence from get_write_access() to synchronize
		 * against collapse_file() regarding i_writecount and nr_thps
		 * updates. Ensures subsequent insertion of THPs into the page
		 * cache will fail.
		 */
		if (filemap_nr_thps(inode->i_mapping)) {
			struct address_space *mapping = inode->i_mapping;

			filemap_invalidate_lock(inode->i_mapping);
			/*
			 * unmap_mapping_range() only needs to be called once
			 * here, because private pages do not need to be
			 * unmapped from the mapping (e.g. the data segment
			 * of dynamic shared libraries).
			 */
			unmap_mapping_range(mapping, 0, 0, 0);
			truncate_inode_pages(mapping, 0);
			filemap_invalidate_unlock(inode->i_mapping);
		}
	}

	return 0;

cleanup_all:
	/* A positive value from a callback is a bug; report -EINVAL instead. */
	if (WARN_ON_ONCE(error > 0))
		error = -EINVAL;
	fops_put(f->f_op);
	put_file_access(f);
cleanup_file:
	path_put(&f->f_path);
	f->f_path.mnt = NULL;
	f->f_path.dentry = NULL;
	f->f_inode = NULL;
	return error;
}
1024
/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * This can be used to finish opening a file passed to i_op->atomic_open().
 *
 * If the open callback is set to NULL, then the standard f_op->open()
 * filesystem callback is substituted.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * Calling this on a file that already has FMODE_OPENED set triggers BUG_ON().
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
		int (*open)(struct inode *, struct file *))
{
	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */

	/* Only the dentry is set here; do_dentry_open() does the rest. */
	file->f_path.dentry = dentry;
	return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);
1051
/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a successful lookup in ->atomic_open().
 * The function itself only stores @dentry in @file->f_path.dentry.
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns "0" which must be the return value of ->atomic_open() after having
 * called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
	file->f_path.dentry = dentry;
	return 0;
}
EXPORT_SYMBOL(finish_no_open);
1072
/**
 * file_path - format the path of an open file
 * @filp: file whose f_path is formatted
 * @buf: buffer handed to d_path()
 * @buflen: length argument handed to d_path() (presumably the size of @buf)
 *
 * Thin wrapper that passes @filp's f_path to d_path().
 *
 * Return: whatever d_path() returns for @filp->f_path.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
	return d_path(&filp->f_path, buf, buflen);
}
EXPORT_SYMBOL(file_path);
1078
1079/**
1080 * vfs_open - open the file at the given path
1081 * @path: path to open
1082 * @file: newly allocated file with f_flag initialized
1083 */
1084int vfs_open(const struct path *path, struct file *file)
1085{
1086	int ret;
1087
1088	file->f_path = *path;
1089	ret = do_dentry_open(file, NULL);
1090	if (!ret) {
1091		/*
1092		 * Once we return a file with FMODE_OPENED, __fput() will call
1093		 * fsnotify_close(), so we need fsnotify_open() here for
1094		 * symmetry.
1095		 */
1096		fsnotify_open(file);
1097	}
1098	return ret;
1099}
1100
1101struct file *dentry_open(const struct path *path, int flags,
1102			 const struct cred *cred)
1103{
1104	int error;
1105	struct file *f;
1106
1107	/* We must always pass in a valid mount pointer. */
1108	BUG_ON(!path->mnt);
1109
1110	f = alloc_empty_file(flags, cred);
1111	if (!IS_ERR(f)) {
1112		error = vfs_open(path, f);
1113		if (error) {
1114			fput(f);
1115			f = ERR_PTR(error);
1116		}
1117	}
1118	return f;
1119}
1120EXPORT_SYMBOL(dentry_open);
1121
1122/**
1123 * dentry_create - Create and open a file
1124 * @path: path to create
1125 * @flags: O_ flags
1126 * @mode: mode bits for new file
1127 * @cred: credentials to use
1128 *
1129 * Caller must hold the parent directory's lock, and have prepared
1130 * a negative dentry, placed in @path->dentry, for the new file.
1131 *
1132 * Caller sets @path->mnt to the vfsmount of the filesystem where
1133 * the new file is to be created. The parent directory and the
1134 * negative dentry must reside on the same filesystem instance.
1135 *
1136 * On success, returns a "struct file *". Otherwise a ERR_PTR
1137 * is returned.
1138 */
1139struct file *dentry_create(const struct path *path, int flags, umode_t mode,
1140			   const struct cred *cred)
1141{
1142	struct file *f;
1143	int error;
1144
1145	f = alloc_empty_file(flags, cred);
1146	if (IS_ERR(f))
1147		return f;
1148
1149	error = vfs_create(mnt_idmap(path->mnt),
1150			   d_inode(path->dentry->d_parent),
1151			   path->dentry, mode, true);
1152	if (!error)
1153		error = vfs_open(path, f);
1154
1155	if (unlikely(error)) {
1156		fput(f);
1157		return ERR_PTR(error);
1158	}
1159	return f;
1160}
1161EXPORT_SYMBOL(dentry_create);
1162
1163/**
1164 * kernel_file_open - open a file for kernel internal use
1165 * @path:	path of the file to open
1166 * @flags:	open flags
1167 * @cred:	credentials for open
1168 *
1169 * Open a file for use by in-kernel consumers. The file is not accounted
1170 * against nr_files and must not be installed into the file descriptor
1171 * table.
1172 *
1173 * Return: Opened file on success, an error pointer on failure.
1174 */
1175struct file *kernel_file_open(const struct path *path, int flags,
1176				const struct cred *cred)
1177{
1178	struct file *f;
1179	int error;
1180
1181	f = alloc_empty_file_noaccount(flags, cred);
1182	if (IS_ERR(f))
1183		return f;
1184
1185	f->f_path = *path;
1186	error = do_dentry_open(f, NULL);
1187	if (error) {
1188		fput(f);
1189		return ERR_PTR(error);
1190	}
1191
1192	fsnotify_open(f);
1193	return f;
1194}
1195EXPORT_SYMBOL_GPL(kernel_file_open);
1196
/*
 * Non-zero when @flags ask for a new file to be created (O_CREAT or
 * __O_TMPFILE).  The argument is parenthesized so that an expression such
 * as "a | b" is masked as a whole.
 */
#define WILL_CREATE(flags)	((flags) & (O_CREAT | __O_TMPFILE))
/* The only flags that may be combined with O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
1199
1200inline struct open_how build_open_how(int flags, umode_t mode)
1201{
1202	struct open_how how = {
1203		.flags = flags & VALID_OPEN_FLAGS,
1204		.mode = mode & S_IALLUGO,
1205	};
1206
1207	/* O_PATH beats everything else. */
1208	if (how.flags & O_PATH)
1209		how.flags &= O_PATH_FLAGS;
1210	/* Modes should only be set for create-like flags. */
1211	if (!WILL_CREATE(how.flags))
1212		how.mode = 0;
1213	return how;
1214}
1215
1216inline int build_open_flags(const struct open_how *how, struct open_flags *op)
1217{
1218	u64 flags = how->flags;
1219	u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
1220	int lookup_flags = 0;
1221	int acc_mode = ACC_MODE(flags);
1222
1223	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
1224			 "struct open_flags doesn't yet handle flags > 32 bits");
1225
1226	/*
1227	 * Strip flags that either shouldn't be set by userspace like
1228	 * FMODE_NONOTIFY or that aren't relevant in determining struct
1229	 * open_flags like O_CLOEXEC.
1230	 */
1231	flags &= ~strip;
1232
1233	/*
1234	 * Older syscalls implicitly clear all of the invalid flags or argument
1235	 * values before calling build_open_flags(), but openat2(2) checks all
1236	 * of its arguments.
1237	 */
1238	if (flags & ~VALID_OPEN_FLAGS)
1239		return -EINVAL;
1240	if (how->resolve & ~VALID_RESOLVE_FLAGS)
1241		return -EINVAL;
1242
1243	/* Scoping flags are mutually exclusive. */
1244	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
1245		return -EINVAL;
1246
1247	/* Deal with the mode. */
1248	if (WILL_CREATE(flags)) {
1249		if (how->mode & ~S_IALLUGO)
1250			return -EINVAL;
1251		op->mode = how->mode | S_IFREG;
1252	} else {
1253		if (how->mode != 0)
1254			return -EINVAL;
1255		op->mode = 0;
1256	}
1257
1258	/*
1259	 * Block bugs where O_DIRECTORY | O_CREAT created regular files.
1260	 * Note, that blocking O_DIRECTORY | O_CREAT here also protects
1261	 * O_TMPFILE below which requires O_DIRECTORY being raised.
1262	 */
1263	if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
1264		return -EINVAL;
1265
1266	/* Now handle the creative implementation of O_TMPFILE. */
1267	if (flags & __O_TMPFILE) {
1268		/*
1269		 * In order to ensure programs get explicit errors when trying
1270		 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
1271		 * is raised alongside __O_TMPFILE.
1272		 */
1273		if (!(flags & O_DIRECTORY))
1274			return -EINVAL;
1275		if (!(acc_mode & MAY_WRITE))
1276			return -EINVAL;
1277	}
1278	if (flags & O_PATH) {
1279		/* O_PATH only permits certain other flags to be set. */
1280		if (flags & ~O_PATH_FLAGS)
1281			return -EINVAL;
1282		acc_mode = 0;
1283	}
1284
1285	/*
1286	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
1287	 * check for O_DSYNC if the need any syncing at all we enforce it's
1288	 * always set instead of having to deal with possibly weird behaviour
1289	 * for malicious applications setting only __O_SYNC.
1290	 */
1291	if (flags & __O_SYNC)
1292		flags |= O_DSYNC;
1293
1294	op->open_flag = flags;
1295
1296	/* O_TRUNC implies we need access checks for write permissions */
1297	if (flags & O_TRUNC)
1298		acc_mode |= MAY_WRITE;
1299
1300	/* Allow the LSM permission hook to distinguish append
1301	   access from general write access. */
1302	if (flags & O_APPEND)
1303		acc_mode |= MAY_APPEND;
1304
1305	op->acc_mode = acc_mode;
1306
1307	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
1308
1309	if (flags & O_CREAT) {
1310		op->intent |= LOOKUP_CREATE;
1311		if (flags & O_EXCL) {
1312			op->intent |= LOOKUP_EXCL;
1313			flags |= O_NOFOLLOW;
1314		}
1315	}
1316
1317	if (flags & O_DIRECTORY)
1318		lookup_flags |= LOOKUP_DIRECTORY;
1319	if (!(flags & O_NOFOLLOW))
1320		lookup_flags |= LOOKUP_FOLLOW;
1321
1322	if (how->resolve & RESOLVE_NO_XDEV)
1323		lookup_flags |= LOOKUP_NO_XDEV;
1324	if (how->resolve & RESOLVE_NO_MAGICLINKS)
1325		lookup_flags |= LOOKUP_NO_MAGICLINKS;
1326	if (how->resolve & RESOLVE_NO_SYMLINKS)
1327		lookup_flags |= LOOKUP_NO_SYMLINKS;
1328	if (how->resolve & RESOLVE_BENEATH)
1329		lookup_flags |= LOOKUP_BENEATH;
1330	if (how->resolve & RESOLVE_IN_ROOT)
1331		lookup_flags |= LOOKUP_IN_ROOT;
1332	if (how->resolve & RESOLVE_CACHED) {
1333		/* Don't bother even trying for create/truncate/tmpfile open */
1334		if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
1335			return -EAGAIN;
1336		lookup_flags |= LOOKUP_CACHED;
1337	}
1338
1339	op->lookup_flags = lookup_flags;
1340	return 0;
1341}
1342
1343/**
1344 * file_open_name - open file and return file pointer
1345 *
1346 * @name:	struct filename containing path to open
1347 * @flags:	open flags as per the open(2) second argument
1348 * @mode:	mode for the new file if O_CREAT is set, else ignored
1349 *
1350 * This is the helper to open a file from kernelspace if you really
1351 * have to.  But in generally you should not do this, so please move
1352 * along, nothing to see here..
1353 */
1354struct file *file_open_name(struct filename *name, int flags, umode_t mode)
1355{
1356	struct open_flags op;
1357	struct open_how how = build_open_how(flags, mode);
1358	int err = build_open_flags(&how, &op);
1359	if (err)
1360		return ERR_PTR(err);
1361	return do_filp_open(AT_FDCWD, name, &op);
1362}
1363
1364/**
1365 * filp_open - open file and return file pointer
1366 *
1367 * @filename:	path to open
1368 * @flags:	open flags as per the open(2) second argument
1369 * @mode:	mode for the new file if O_CREAT is set, else ignored
1370 *
1371 * This is the helper to open a file from kernelspace if you really
1372 * have to.  But in generally you should not do this, so please move
1373 * along, nothing to see here..
1374 */
1375struct file *filp_open(const char *filename, int flags, umode_t mode)
1376{
1377	struct filename *name = getname_kernel(filename);
1378	struct file *file = ERR_CAST(name);
1379
1380	if (!IS_ERR(name)) {
1381		file = file_open_name(name, flags, mode);
1382		putname(name);
1383	}
1384	return file;
1385}
1386EXPORT_SYMBOL(filp_open);
1387
1388struct file *file_open_root(const struct path *root,
1389			    const char *filename, int flags, umode_t mode)
1390{
1391	struct open_flags op;
1392	struct open_how how = build_open_how(flags, mode);
1393	int err = build_open_flags(&how, &op);
1394	if (err)
1395		return ERR_PTR(err);
1396	return do_file_open_root(root, filename, &op);
1397}
1398EXPORT_SYMBOL(file_open_root);
1399
1400static long do_sys_openat2(int dfd, const char __user *filename,
1401			   struct open_how *how)
1402{
1403	struct open_flags op;
1404	int fd = build_open_flags(how, &op);
1405	struct filename *tmp;
1406
1407	if (fd)
1408		return fd;
1409
1410	tmp = getname(filename);
1411	if (IS_ERR(tmp))
1412		return PTR_ERR(tmp);
1413
1414	fd = get_unused_fd_flags(how->flags);
1415	if (fd >= 0) {
1416		struct file *f = do_filp_open(dfd, tmp, &op);
1417		if (IS_ERR(f)) {
1418			put_unused_fd(fd);
1419			fd = PTR_ERR(f);
1420		} else {
1421			fd_install(fd, f);
1422		}
1423	}
1424	putname(tmp);
1425	return fd;
1426}
1427
1428long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
1429{
1430	struct open_how how = build_open_how(flags, mode);
1431	return do_sys_openat2(dfd, filename, &how);
1432}
1433
1434
1435SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1436{
1437	if (force_o_largefile())
1438		flags |= O_LARGEFILE;
1439	return do_sys_open(AT_FDCWD, filename, flags, mode);
1440}
1441
1442SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1443		umode_t, mode)
1444{
1445	if (force_o_largefile())
1446		flags |= O_LARGEFILE;
1447	return do_sys_open(dfd, filename, flags, mode);
1448}
1449
1450SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
1451		struct open_how __user *, how, size_t, usize)
1452{
1453	int err;
1454	struct open_how tmp;
1455
1456	BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
1457	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
1458
1459	if (unlikely(usize < OPEN_HOW_SIZE_VER0))
1460		return -EINVAL;
1461
1462	err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
1463	if (err)
1464		return err;
1465
1466	audit_openat2_how(&tmp);
1467
1468	/* O_LARGEFILE is only allowed for non-O_PATH. */
1469	if (!(tmp.flags & O_PATH) && force_o_largefile())
1470		tmp.flags |= O_LARGEFILE;
1471
1472	return do_sys_openat2(dfd, filename, &tmp);
1473}
1474
1475#ifdef CONFIG_COMPAT
1476/*
1477 * Exactly like sys_open(), except that it doesn't set the
1478 * O_LARGEFILE flag.
1479 */
1480COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1481{
1482	return do_sys_open(AT_FDCWD, filename, flags, mode);
1483}
1484
1485/*
1486 * Exactly like sys_openat(), except that it doesn't set the
1487 * O_LARGEFILE flag.
1488 */
1489COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
1490{
1491	return do_sys_open(dfd, filename, flags, mode);
1492}
1493#endif
1494
1495#ifndef __alpha__
1496
1497/*
1498 * For backward compatibility?  Maybe this should be moved
1499 * into arch/i386 instead?
1500 */
1501SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1502{
1503	int flags = O_CREAT | O_WRONLY | O_TRUNC;
1504
1505	if (force_o_largefile())
1506		flags |= O_LARGEFILE;
1507	return do_sys_open(AT_FDCWD, pathname, flags, mode);
1508}
1509#endif
1510
1511/*
1512 * "id" is the POSIX thread ID. We use the
1513 * files pointer for this..
1514 */
1515static int filp_flush(struct file *filp, fl_owner_t id)
1516{
1517	int retval = 0;
1518
1519	if (CHECK_DATA_CORRUPTION(file_count(filp) == 0,
1520			"VFS: Close: file count is 0 (f_op=%ps)",
1521			filp->f_op)) {
1522		return 0;
1523	}
1524
1525	if (filp->f_op->flush)
1526		retval = filp->f_op->flush(filp, id);
1527
1528	if (likely(!(filp->f_mode & FMODE_PATH))) {
1529		dnotify_flush(filp, id);
1530		locks_remove_posix(filp, id);
1531	}
1532	return retval;
1533}
1534
1535int filp_close(struct file *filp, fl_owner_t id)
1536{
1537	int retval;
1538
1539	retval = filp_flush(filp, id);
1540	fput(filp);
1541
1542	return retval;
1543}
1544EXPORT_SYMBOL(filp_close);
1545
1546/*
1547 * Careful here! We test whether the file pointer is NULL before
1548 * releasing the fd. This ensures that one clone task can't release
1549 * an fd while another clone is opening it.
1550 */
1551SYSCALL_DEFINE1(close, unsigned int, fd)
1552{
1553	int retval;
1554	struct file *file;
1555
1556	file = file_close_fd(fd);
1557	if (!file)
1558		return -EBADF;
1559
1560	retval = filp_flush(file, current->files);
1561
1562	/*
1563	 * We're returning to user space. Don't bother
1564	 * with any delayed fput() cases.
1565	 */
1566	__fput_sync(file);
1567
1568	/* can't restart close syscall because file table entry was cleared */
1569	if (unlikely(retval == -ERESTARTSYS ||
1570		     retval == -ERESTARTNOINTR ||
1571		     retval == -ERESTARTNOHAND ||
1572		     retval == -ERESTART_RESTARTBLOCK))
1573		retval = -EINTR;
1574
1575	return retval;
1576}
1577
1578/**
1579 * sys_close_range() - Close all file descriptors in a given range.
1580 *
1581 * @fd:     starting file descriptor to close
1582 * @max_fd: last file descriptor to close
1583 * @flags:  reserved for future extensions
1584 *
1585 * This closes a range of file descriptors. All file descriptors
1586 * from @fd up to and including @max_fd are closed.
1587 * Currently, errors to close a given file descriptor are ignored.
1588 */
1589SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
1590		unsigned int, flags)
1591{
1592	return __close_range(fd, max_fd, flags);
1593}
1594
1595/*
1596 * This routine simulates a hangup on the tty, to arrange that users
1597 * are given clean terminals at login time.
1598 */
1599SYSCALL_DEFINE0(vhangup)
1600{
1601	if (capable(CAP_SYS_TTY_CONFIG)) {
1602		tty_vhangup_self();
1603		return 0;
1604	}
1605	return -EPERM;
1606}
1607
1608/*
1609 * Called when an inode is about to be open.
1610 * We use this to disallow opening large files on 32bit systems if
1611 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1612 * on this flag in sys_open.
1613 */
1614int generic_file_open(struct inode * inode, struct file * filp)
1615{
1616	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1617		return -EOVERFLOW;
1618	return 0;
1619}
1620
1621EXPORT_SYMBOL(generic_file_open);
1622
1623/*
1624 * This is used by subsystems that don't want seekable
1625 * file descriptors. The function is not supposed to ever fail, the only
1626 * reason it returns an 'int' and not 'void' is so that it can be plugged
1627 * directly into file_operations structure.
1628 */
1629int nonseekable_open(struct inode *inode, struct file *filp)
1630{
1631	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1632	return 0;
1633}
1634
1635EXPORT_SYMBOL(nonseekable_open);
1636
1637/*
1638 * stream_open is used by subsystems that want stream-like file descriptors.
1639 * Such file descriptors are not seekable and don't have notion of position
1640 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
1641 * Contrary to file descriptors of other regular files, .read() and .write()
1642 * can run simultaneously.
1643 *
1644 * stream_open never fails and is marked to return int so that it could be
1645 * directly used as file_operations.open .
1646 */
1647int stream_open(struct inode *inode, struct file *filp)
1648{
1649	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
1650	filp->f_mode |= FMODE_STREAM;
1651	return 0;
1652}
1653
1654EXPORT_SYMBOL(stream_open);