fs/open.c at v6.15-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / open.c
at v6.15-rc3 1655 lines 41 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/fs/open.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/string.h>
   9#include <linux/mm.h>
  10#include <linux/file.h>
  11#include <linux/fdtable.h>
  12#include <linux/fsnotify.h>
  13#include <linux/module.h>
  14#include <linux/tty.h>
  15#include <linux/namei.h>
  16#include <linux/backing-dev.h>
  17#include <linux/capability.h>
  18#include <linux/securebits.h>
  19#include <linux/security.h>
  20#include <linux/mount.h>
  21#include <linux/fcntl.h>
  22#include <linux/slab.h>
  23#include <linux/uaccess.h>
  24#include <linux/fs.h>
  25#include <linux/personality.h>
  26#include <linux/pagemap.h>
  27#include <linux/syscalls.h>
  28#include <linux/rcupdate.h>
  29#include <linux/audit.h>
  30#include <linux/falloc.h>
  31#include <linux/fs_struct.h>
  32#include <linux/dnotify.h>
  33#include <linux/compat.h>
  34#include <linux/mnt_idmapping.h>
  35#include <linux/filelock.h>
  36
  37#include "internal.h"
  38
  39int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
  40		loff_t length, unsigned int time_attrs, struct file *filp)
  41{
  42	int ret;
  43	struct iattr newattrs;
  44
  45	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
  46	if (length < 0)
  47		return -EINVAL;
  48
  49	newattrs.ia_size = length;
  50	newattrs.ia_valid = ATTR_SIZE | time_attrs;
  51	if (filp) {
  52		newattrs.ia_file = filp;
  53		newattrs.ia_valid |= ATTR_FILE;
  54	}
  55
  56	/* Remove suid, sgid, and file capabilities on truncate too */
  57	ret = dentry_needs_remove_privs(idmap, dentry);
  58	if (ret < 0)
  59		return ret;
  60	if (ret)
  61		newattrs.ia_valid |= ret | ATTR_FORCE;
  62
  63	inode_lock(dentry->d_inode);
  64	/* Note any delegations or leases have already been broken: */
  65	ret = notify_change(idmap, dentry, &newattrs, NULL);
  66	inode_unlock(dentry->d_inode);
  67	return ret;
  68}
  69
  70int vfs_truncate(const struct path *path, loff_t length)
  71{
  72	struct mnt_idmap *idmap;
  73	struct inode *inode;
  74	int error;
  75
  76	inode = path->dentry->d_inode;
  77
  78	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
  79	if (S_ISDIR(inode->i_mode))
  80		return -EISDIR;
  81	if (!S_ISREG(inode->i_mode))
  82		return -EINVAL;
  83
  84	idmap = mnt_idmap(path->mnt);
  85	error = inode_permission(idmap, inode, MAY_WRITE);
  86	if (error)
  87		return error;
  88
  89	error = fsnotify_truncate_perm(path, length);
  90	if (error)
  91		return error;
  92
  93	error = mnt_want_write(path->mnt);
  94	if (error)
  95		return error;
  96
  97	error = -EPERM;
  98	if (IS_APPEND(inode))
  99		goto mnt_drop_write_and_out;
 100
 101	error = get_write_access(inode);
 102	if (error)
 103		goto mnt_drop_write_and_out;
 104
 105	/*
 106	 * Make sure that there are no leases.  get_write_access() protects
 107	 * against the truncate racing with a lease-granting setlease().
 108	 */
 109	error = break_lease(inode, O_WRONLY);
 110	if (error)
 111		goto put_write_and_out;
 112
 113	error = security_path_truncate(path);
 114	if (!error)
 115		error = do_truncate(idmap, path->dentry, length, 0, NULL);
 116
 117put_write_and_out:
 118	put_write_access(inode);
 119mnt_drop_write_and_out:
 120	mnt_drop_write(path->mnt);
 121
 122	return error;
 123}
 124EXPORT_SYMBOL_GPL(vfs_truncate);
 125
 126int do_sys_truncate(const char __user *pathname, loff_t length)
 127{
 128	unsigned int lookup_flags = LOOKUP_FOLLOW;
 129	struct path path;
 130	int error;
 131
 132	if (length < 0)	/* sorry, but loff_t says... */
 133		return -EINVAL;
 134
 135retry:
 136	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 137	if (!error) {
 138		error = vfs_truncate(&path, length);
 139		path_put(&path);
 140	}
 141	if (retry_estale(error, lookup_flags)) {
 142		lookup_flags |= LOOKUP_REVAL;
 143		goto retry;
 144	}
 145	return error;
 146}
 147
 148SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 149{
 150	return do_sys_truncate(path, length);
 151}
 152
 153#ifdef CONFIG_COMPAT
 154COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
 155{
 156	return do_sys_truncate(path, length);
 157}
 158#endif
 159
 160int do_ftruncate(struct file *file, loff_t length, int small)
 161{
 162	struct inode *inode;
 163	struct dentry *dentry;
 164	int error;
 165
 166	/* explicitly opened as large or we are on 64-bit box */
 167	if (file->f_flags & O_LARGEFILE)
 168		small = 0;
 169
 170	dentry = file->f_path.dentry;
 171	inode = dentry->d_inode;
 172	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
 173		return -EINVAL;
 174
 175	/* Cannot ftruncate over 2^31 bytes without large file support */
 176	if (small && length > MAX_NON_LFS)
 177		return -EINVAL;
 178
 179	/* Check IS_APPEND on real upper inode */
 180	if (IS_APPEND(file_inode(file)))
 181		return -EPERM;
 182
 183	error = security_file_truncate(file);
 184	if (error)
 185		return error;
 186
 187	error = fsnotify_truncate_perm(&file->f_path, length);
 188	if (error)
 189		return error;
 190
 191	sb_start_write(inode->i_sb);
 192	error = do_truncate(file_mnt_idmap(file), dentry, length,
 193			    ATTR_MTIME | ATTR_CTIME, file);
 194	sb_end_write(inode->i_sb);
 195
 196	return error;
 197}
 198
 199int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 200{
 201	if (length < 0)
 202		return -EINVAL;
 203	CLASS(fd, f)(fd);
 204	if (fd_empty(f))
 205		return -EBADF;
 206
 207	return do_ftruncate(fd_file(f), length, small);
 208}
 209
 210SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
 211{
 212	return do_sys_ftruncate(fd, length, 1);
 213}
 214
 215#ifdef CONFIG_COMPAT
 216COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
 217{
 218	return do_sys_ftruncate(fd, length, 1);
 219}
 220#endif
 221
 222/* LFS versions of truncate are only needed on 32 bit machines */
 223#if BITS_PER_LONG == 32
 224SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
 225{
 226	return do_sys_truncate(path, length);
 227}
 228
 229SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
 230{
 231	return do_sys_ftruncate(fd, length, 0);
 232}
 233#endif /* BITS_PER_LONG == 32 */
 234
 235#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
 236COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
 237		       compat_arg_u64_dual(length))
 238{
 239	return ksys_truncate(pathname, compat_arg_u64_glue(length));
 240}
 241#endif
 242
 243#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
 244COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
 245		       compat_arg_u64_dual(length))
 246{
 247	return ksys_ftruncate(fd, compat_arg_u64_glue(length));
 248}
 249#endif
 250
 251int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 252{
 253	struct inode *inode = file_inode(file);
 254	int ret;
 255	loff_t sum;
 256
 257	if (offset < 0 || len <= 0)
 258		return -EINVAL;
 259
 260	if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
 261		return -EOPNOTSUPP;
 262
 263	/*
 264	 * Modes are exclusive, even if that is not obvious from the encoding
 265	 * as bit masks and the mix with the flag in the same namespace.
 266	 *
 267	 * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
 268	 * encoded as no bit set.
 269	 */
 270	switch (mode & FALLOC_FL_MODE_MASK) {
 271	case FALLOC_FL_ALLOCATE_RANGE:
 272	case FALLOC_FL_UNSHARE_RANGE:
 273	case FALLOC_FL_ZERO_RANGE:
 274		break;
 275	case FALLOC_FL_PUNCH_HOLE:
 276		if (!(mode & FALLOC_FL_KEEP_SIZE))
 277			return -EOPNOTSUPP;
 278		break;
 279	case FALLOC_FL_COLLAPSE_RANGE:
 280	case FALLOC_FL_INSERT_RANGE:
 281		if (mode & FALLOC_FL_KEEP_SIZE)
 282			return -EOPNOTSUPP;
 283		break;
 284	default:
 285		return -EOPNOTSUPP;
 286	}
 287
 288	if (!(file->f_mode & FMODE_WRITE))
 289		return -EBADF;
 290
 291	/*
 292	 * On append-only files only space preallocation is supported.
 293	 */
 294	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
 295		return -EPERM;
 296
 297	if (IS_IMMUTABLE(inode))
 298		return -EPERM;
 299
 300	/*
 301	 * We cannot allow any fallocate operation on an active swapfile
 302	 */
 303	if (IS_SWAPFILE(inode))
 304		return -ETXTBSY;
 305
 306	/*
 307	 * Revalidate the write permissions, in case security policy has
 308	 * changed since the files were opened.
 309	 */
 310	ret = security_file_permission(file, MAY_WRITE);
 311	if (ret)
 312		return ret;
 313
 314	ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
 315	if (ret)
 316		return ret;
 317
 318	if (S_ISFIFO(inode->i_mode))
 319		return -ESPIPE;
 320
 321	if (S_ISDIR(inode->i_mode))
 322		return -EISDIR;
 323
 324	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 325		return -ENODEV;
 326
 327	/* Check for wraparound */
 328	if (check_add_overflow(offset, len, &sum))
 329		return -EFBIG;
 330
 331	if (sum > inode->i_sb->s_maxbytes)
 332		return -EFBIG;
 333
 334	if (!file->f_op->fallocate)
 335		return -EOPNOTSUPP;
 336
 337	file_start_write(file);
 338	ret = file->f_op->fallocate(file, mode, offset, len);
 339
 340	/*
 341	 * Create inotify and fanotify events.
 342	 *
 343	 * To keep the logic simple always create events if fallocate succeeds.
 344	 * This implies that events are even created if the file size remains
 345	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
 346	 */
 347	if (ret == 0)
 348		fsnotify_modify(file);
 349
 350	file_end_write(file);
 351	return ret;
 352}
 353EXPORT_SYMBOL_GPL(vfs_fallocate);
 354
 355int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 356{
 357	CLASS(fd, f)(fd);
 358
 359	if (fd_empty(f))
 360		return -EBADF;
 361
 362	return vfs_fallocate(fd_file(f), mode, offset, len);
 363}
 364
 365SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
 366{
 367	return ksys_fallocate(fd, mode, offset, len);
 368}
 369
 370#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
 371COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
 372		       compat_arg_u64_dual(len))
 373{
 374	return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
 375			      compat_arg_u64_glue(len));
 376}
 377#endif
 378
 379/*
 380 * access() needs to use the real uid/gid, not the effective uid/gid.
 381 * We do this by temporarily clearing all FS-related capabilities and
 382 * switching the fsuid/fsgid around to the real ones.
 383 *
 384 * Creating new credentials is expensive, so we try to skip doing it,
 385 * which we can if the result would match what we already got.
 386 */
 387static bool access_need_override_creds(int flags)
 388{
 389	const struct cred *cred;
 390
 391	if (flags & AT_EACCESS)
 392		return false;
 393
 394	cred = current_cred();
 395	if (!uid_eq(cred->fsuid, cred->uid) ||
 396	    !gid_eq(cred->fsgid, cred->gid))
 397		return true;
 398
 399	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 400		kuid_t root_uid = make_kuid(cred->user_ns, 0);
 401		if (!uid_eq(cred->uid, root_uid)) {
 402			if (!cap_isclear(cred->cap_effective))
 403				return true;
 404		} else {
 405			if (!cap_isidentical(cred->cap_effective,
 406			    cred->cap_permitted))
 407				return true;
 408		}
 409	}
 410
 411	return false;
 412}
 413
 414static const struct cred *access_override_creds(void)
 415{
 416	struct cred *override_cred;
 417
 418	override_cred = prepare_creds();
 419	if (!override_cred)
 420		return NULL;
 421
 422	/*
 423	 * XXX access_need_override_creds performs checks in hopes of skipping
 424	 * this work. Make sure it stays in sync if making any changes in this
 425	 * routine.
 426	 */
 427
 428	override_cred->fsuid = override_cred->uid;
 429	override_cred->fsgid = override_cred->gid;
 430
 431	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 432		/* Clear the capabilities if we switch to a non-root user */
 433		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
 434		if (!uid_eq(override_cred->uid, root_uid))
 435			cap_clear(override_cred->cap_effective);
 436		else
 437			override_cred->cap_effective =
 438				override_cred->cap_permitted;
 439	}
 440
 441	/*
 442	 * The new set of credentials can *only* be used in
 443	 * task-synchronous circumstances, and does not need
 444	 * RCU freeing, unless somebody then takes a separate
 445	 * reference to it.
 446	 *
 447	 * NOTE! This is _only_ true because this credential
 448	 * is used purely for override_creds() that installs
 449	 * it as the subjective cred. Other threads will be
 450	 * accessing ->real_cred, not the subjective cred.
 451	 *
 452	 * If somebody _does_ make a copy of this (using the
 453	 * 'get_current_cred()' function), that will clear the
 454	 * non_rcu field, because now that other user may be
 455	 * expecting RCU freeing. But normal thread-synchronous
 456	 * cred accesses will keep things non-racy to avoid RCU
 457	 * freeing.
 458	 */
 459	override_cred->non_rcu = 1;
 460	return override_creds(override_cred);
 461}
 462
 463static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
 464{
 465	struct path path;
 466	struct inode *inode;
 467	int res;
 468	unsigned int lookup_flags = LOOKUP_FOLLOW;
 469	const struct cred *old_cred = NULL;
 470
 471	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 472		return -EINVAL;
 473
 474	if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
 475		return -EINVAL;
 476
 477	if (flags & AT_SYMLINK_NOFOLLOW)
 478		lookup_flags &= ~LOOKUP_FOLLOW;
 479	if (flags & AT_EMPTY_PATH)
 480		lookup_flags |= LOOKUP_EMPTY;
 481
 482	if (access_need_override_creds(flags)) {
 483		old_cred = access_override_creds();
 484		if (!old_cred)
 485			return -ENOMEM;
 486	}
 487
 488retry:
 489	res = user_path_at(dfd, filename, lookup_flags, &path);
 490	if (res)
 491		goto out;
 492
 493	inode = d_backing_inode(path.dentry);
 494
 495	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 496		/*
 497		 * MAY_EXEC on regular files is denied if the fs is mounted
 498		 * with the "noexec" flag.
 499		 */
 500		res = -EACCES;
 501		if (path_noexec(&path))
 502			goto out_path_release;
 503	}
 504
 505	res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);
 506	/* SuS v2 requires we report a read only fs too */
 507	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 508		goto out_path_release;
 509	/*
 510	 * This is a rare case where using __mnt_is_readonly()
 511	 * is OK without a mnt_want/drop_write() pair.  Since
 512	 * no actual write to the fs is performed here, we do
 513	 * not need to telegraph to that to anyone.
 514	 *
 515	 * By doing this, we accept that this access is
 516	 * inherently racy and know that the fs may change
 517	 * state before we even see this result.
 518	 */
 519	if (__mnt_is_readonly(path.mnt))
 520		res = -EROFS;
 521
 522out_path_release:
 523	path_put(&path);
 524	if (retry_estale(res, lookup_flags)) {
 525		lookup_flags |= LOOKUP_REVAL;
 526		goto retry;
 527	}
 528out:
 529	if (old_cred)
 530		put_cred(revert_creds(old_cred));
 531
 532	return res;
 533}
 534
 535SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 536{
 537	return do_faccessat(dfd, filename, mode, 0);
 538}
 539
 540SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
 541		int, flags)
 542{
 543	return do_faccessat(dfd, filename, mode, flags);
 544}
 545
 546SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 547{
 548	return do_faccessat(AT_FDCWD, filename, mode, 0);
 549}
 550
 551SYSCALL_DEFINE1(chdir, const char __user *, filename)
 552{
 553	struct path path;
 554	int error;
 555	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 556retry:
 557	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 558	if (error)
 559		goto out;
 560
 561	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 562	if (error)
 563		goto dput_and_out;
 564
 565	set_fs_pwd(current->fs, &path);
 566
 567dput_and_out:
 568	path_put(&path);
 569	if (retry_estale(error, lookup_flags)) {
 570		lookup_flags |= LOOKUP_REVAL;
 571		goto retry;
 572	}
 573out:
 574	return error;
 575}
 576
 577SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 578{
 579	CLASS(fd_raw, f)(fd);
 580	int error;
 581
 582	if (fd_empty(f))
 583		return -EBADF;
 584
 585	if (!d_can_lookup(fd_file(f)->f_path.dentry))
 586		return -ENOTDIR;
 587
 588	error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
 589	if (!error)
 590		set_fs_pwd(current->fs, &fd_file(f)->f_path);
 591	return error;
 592}
 593
 594SYSCALL_DEFINE1(chroot, const char __user *, filename)
 595{
 596	struct path path;
 597	int error;
 598	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 599retry:
 600	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 601	if (error)
 602		goto out;
 603
 604	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 605	if (error)
 606		goto dput_and_out;
 607
 608	error = -EPERM;
 609	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
 610		goto dput_and_out;
 611	error = security_path_chroot(&path);
 612	if (error)
 613		goto dput_and_out;
 614
 615	set_fs_root(current->fs, &path);
 616	error = 0;
 617dput_and_out:
 618	path_put(&path);
 619	if (retry_estale(error, lookup_flags)) {
 620		lookup_flags |= LOOKUP_REVAL;
 621		goto retry;
 622	}
 623out:
 624	return error;
 625}
 626
 627int chmod_common(const struct path *path, umode_t mode)
 628{
 629	struct inode *inode = path->dentry->d_inode;
 630	struct inode *delegated_inode = NULL;
 631	struct iattr newattrs;
 632	int error;
 633
 634	error = mnt_want_write(path->mnt);
 635	if (error)
 636		return error;
 637retry_deleg:
 638	inode_lock(inode);
 639	error = security_path_chmod(path, mode);
 640	if (error)
 641		goto out_unlock;
 642	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 643	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 644	error = notify_change(mnt_idmap(path->mnt), path->dentry,
 645			      &newattrs, &delegated_inode);
 646out_unlock:
 647	inode_unlock(inode);
 648	if (delegated_inode) {
 649		error = break_deleg_wait(&delegated_inode);
 650		if (!error)
 651			goto retry_deleg;
 652	}
 653	mnt_drop_write(path->mnt);
 654	return error;
 655}
 656
 657int vfs_fchmod(struct file *file, umode_t mode)
 658{
 659	audit_file(file);
 660	return chmod_common(&file->f_path, mode);
 661}
 662
 663SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 664{
 665	CLASS(fd, f)(fd);
 666
 667	if (fd_empty(f))
 668		return -EBADF;
 669
 670	return vfs_fchmod(fd_file(f), mode);
 671}
 672
 673static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
 674		       unsigned int flags)
 675{
 676	struct path path;
 677	int error;
 678	unsigned int lookup_flags;
 679
 680	if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
 681		return -EINVAL;
 682
 683	lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 684	if (flags & AT_EMPTY_PATH)
 685		lookup_flags |= LOOKUP_EMPTY;
 686
 687retry:
 688	error = user_path_at(dfd, filename, lookup_flags, &path);
 689	if (!error) {
 690		error = chmod_common(&path, mode);
 691		path_put(&path);
 692		if (retry_estale(error, lookup_flags)) {
 693			lookup_flags |= LOOKUP_REVAL;
 694			goto retry;
 695		}
 696	}
 697	return error;
 698}
 699
 700SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
 701		umode_t, mode, unsigned int, flags)
 702{
 703	return do_fchmodat(dfd, filename, mode, flags);
 704}
 705
 706SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
 707		umode_t, mode)
 708{
 709	return do_fchmodat(dfd, filename, mode, 0);
 710}
 711
 712SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 713{
 714	return do_fchmodat(AT_FDCWD, filename, mode, 0);
 715}
 716
 717/*
 718 * Check whether @kuid is valid and if so generate and set vfsuid_t in
 719 * ia_vfsuid.
 720 *
 721 * Return: true if @kuid is valid, false if not.
 722 */
 723static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
 724{
 725	if (!uid_valid(kuid))
 726		return false;
 727	attr->ia_valid |= ATTR_UID;
 728	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
 729	return true;
 730}
 731
 732/*
 733 * Check whether @kgid is valid and if so generate and set vfsgid_t in
 734 * ia_vfsgid.
 735 *
 736 * Return: true if @kgid is valid, false if not.
 737 */
 738static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
 739{
 740	if (!gid_valid(kgid))
 741		return false;
 742	attr->ia_valid |= ATTR_GID;
 743	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
 744	return true;
 745}
 746
 747int chown_common(const struct path *path, uid_t user, gid_t group)
 748{
 749	struct mnt_idmap *idmap;
 750	struct user_namespace *fs_userns;
 751	struct inode *inode = path->dentry->d_inode;
 752	struct inode *delegated_inode = NULL;
 753	int error;
 754	struct iattr newattrs;
 755	kuid_t uid;
 756	kgid_t gid;
 757
 758	uid = make_kuid(current_user_ns(), user);
 759	gid = make_kgid(current_user_ns(), group);
 760
 761	idmap = mnt_idmap(path->mnt);
 762	fs_userns = i_user_ns(inode);
 763
 764retry_deleg:
 765	newattrs.ia_vfsuid = INVALID_VFSUID;
 766	newattrs.ia_vfsgid = INVALID_VFSGID;
 767	newattrs.ia_valid =  ATTR_CTIME;
 768	if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
 769		return -EINVAL;
 770	if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
 771		return -EINVAL;
 772	inode_lock(inode);
 773	if (!S_ISDIR(inode->i_mode))
 774		newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
 775				     setattr_should_drop_sgid(idmap, inode);
 776	/* Continue to send actual fs values, not the mount values. */
 777	error = security_path_chown(
 778		path,
 779		from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
 780		from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
 781	if (!error)
 782		error = notify_change(idmap, path->dentry, &newattrs,
 783				      &delegated_inode);
 784	inode_unlock(inode);
 785	if (delegated_inode) {
 786		error = break_deleg_wait(&delegated_inode);
 787		if (!error)
 788			goto retry_deleg;
 789	}
 790	return error;
 791}
 792
 793int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
 794		int flag)
 795{
 796	struct path path;
 797	int error = -EINVAL;
 798	int lookup_flags;
 799
 800	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 801		goto out;
 802
 803	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 804	if (flag & AT_EMPTY_PATH)
 805		lookup_flags |= LOOKUP_EMPTY;
 806retry:
 807	error = user_path_at(dfd, filename, lookup_flags, &path);
 808	if (error)
 809		goto out;
 810	error = mnt_want_write(path.mnt);
 811	if (error)
 812		goto out_release;
 813	error = chown_common(&path, user, group);
 814	mnt_drop_write(path.mnt);
 815out_release:
 816	path_put(&path);
 817	if (retry_estale(error, lookup_flags)) {
 818		lookup_flags |= LOOKUP_REVAL;
 819		goto retry;
 820	}
 821out:
 822	return error;
 823}
 824
 825SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 826		gid_t, group, int, flag)
 827{
 828	return do_fchownat(dfd, filename, user, group, flag);
 829}
 830
 831SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 832{
 833	return do_fchownat(AT_FDCWD, filename, user, group, 0);
 834}
 835
 836SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 837{
 838	return do_fchownat(AT_FDCWD, filename, user, group,
 839			   AT_SYMLINK_NOFOLLOW);
 840}
 841
 842int vfs_fchown(struct file *file, uid_t user, gid_t group)
 843{
 844	int error;
 845
 846	error = mnt_want_write_file(file);
 847	if (error)
 848		return error;
 849	audit_file(file);
 850	error = chown_common(&file->f_path, user, group);
 851	mnt_drop_write_file(file);
 852	return error;
 853}
 854
 855int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
 856{
 857	CLASS(fd, f)(fd);
 858
 859	if (fd_empty(f))
 860		return -EBADF;
 861
 862	return vfs_fchown(fd_file(f), user, group);
 863}
 864
 865SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 866{
 867	return ksys_fchown(fd, user, group);
 868}
 869
 870static inline int file_get_write_access(struct file *f)
 871{
 872	int error;
 873
 874	error = get_write_access(f->f_inode);
 875	if (unlikely(error))
 876		return error;
 877	error = mnt_get_write_access(f->f_path.mnt);
 878	if (unlikely(error))
 879		goto cleanup_inode;
 880	if (unlikely(f->f_mode & FMODE_BACKING)) {
 881		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
 882		if (unlikely(error))
 883			goto cleanup_mnt;
 884	}
 885	return 0;
 886
 887cleanup_mnt:
 888	mnt_put_write_access(f->f_path.mnt);
 889cleanup_inode:
 890	put_write_access(f->f_inode);
 891	return error;
 892}
 893
 894static int do_dentry_open(struct file *f,
 895			  int (*open)(struct inode *, struct file *))
 896{
 897	static const struct file_operations empty_fops = {};
 898	struct inode *inode = f->f_path.dentry->d_inode;
 899	int error;
 900
 901	path_get(&f->f_path);
 902	f->f_inode = inode;
 903	f->f_mapping = inode->i_mapping;
 904	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
 905	f->f_sb_err = file_sample_sb_err(f);
 906
 907	if (unlikely(f->f_flags & O_PATH)) {
 908		f->f_mode = FMODE_PATH | FMODE_OPENED;
 909		file_set_fsnotify_mode(f, FMODE_NONOTIFY);
 910		f->f_op = &empty_fops;
 911		return 0;
 912	}
 913
 914	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
 915		i_readcount_inc(inode);
 916	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 917		error = file_get_write_access(f);
 918		if (unlikely(error))
 919			goto cleanup_file;
 920		f->f_mode |= FMODE_WRITER;
 921	}
 922
 923	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
 924	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
 925		f->f_mode |= FMODE_ATOMIC_POS;
 926
 927	f->f_op = fops_get(inode->i_fop);
 928	if (WARN_ON(!f->f_op)) {
 929		error = -ENODEV;
 930		goto cleanup_all;
 931	}
 932
 933	error = security_file_open(f);
 934	if (error)
 935		goto cleanup_all;
 936
 937	/*
 938	 * Set FMODE_NONOTIFY_* bits according to existing permission watches.
 939	 * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
 940	 * pseudo file, this call will not change the mode.
 941	 */
 942	file_set_fsnotify_mode_from_watchers(f);
 943	error = fsnotify_open_perm(f);
 944	if (error)
 945		goto cleanup_all;
 946
 947	error = break_lease(file_inode(f), f->f_flags);
 948	if (error)
 949		goto cleanup_all;
 950
 951	/* normally all 3 are set; ->open() can clear them if needed */
 952	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 953	if (!open)
 954		open = f->f_op->open;
 955	if (open) {
 956		error = open(inode, f);
 957		if (error)
 958			goto cleanup_all;
 959	}
 960	f->f_mode |= FMODE_OPENED;
 961	if ((f->f_mode & FMODE_READ) &&
 962	     likely(f->f_op->read || f->f_op->read_iter))
 963		f->f_mode |= FMODE_CAN_READ;
 964	if ((f->f_mode & FMODE_WRITE) &&
 965	     likely(f->f_op->write || f->f_op->write_iter))
 966		f->f_mode |= FMODE_CAN_WRITE;
 967	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
 968		f->f_mode &= ~FMODE_LSEEK;
 969	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
 970		f->f_mode |= FMODE_CAN_ODIRECT;
 971
 972	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 973	f->f_iocb_flags = iocb_flags(f);
 974
 975	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 976
 977	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
 978		return -EINVAL;
 979
 980	/*
 981	 * XXX: Huge page cache doesn't support writing yet. Drop all page
 982	 * cache for this file before processing writes.
 983	 */
 984	if (f->f_mode & FMODE_WRITE) {
 985		/*
 986		 * Depends on full fence from get_write_access() to synchronize
 987		 * against collapse_file() regarding i_writecount and nr_thps
 988		 * updates. Ensures subsequent insertion of THPs into the page
 989		 * cache will fail.
 990		 */
 991		if (filemap_nr_thps(inode->i_mapping)) {
 992			struct address_space *mapping = inode->i_mapping;
 993
 994			filemap_invalidate_lock(inode->i_mapping);
 995			/*
 996			 * unmap_mapping_range just need to be called once
 997			 * here, because the private pages is not need to be
 998			 * unmapped mapping (e.g. data segment of dynamic
 999			 * shared libraries here).
1000			 */
1001			unmap_mapping_range(mapping, 0, 0, 0);
1002			truncate_inode_pages(mapping, 0);
1003			filemap_invalidate_unlock(inode->i_mapping);
1004		}
1005	}
1006
1007	return 0;
1008
1009cleanup_all:
1010	if (WARN_ON_ONCE(error > 0))
1011		error = -EINVAL;
1012	fops_put(f->f_op);
1013	put_file_access(f);
1014cleanup_file:
1015	path_put(&f->f_path);
1016	f->f_path.mnt = NULL;
1017	f->f_path.dentry = NULL;
1018	f->f_inode = NULL;
1019	return error;
1020}
1021
1022/**
1023 * finish_open - finish opening a file
1024 * @file: file pointer
1025 * @dentry: pointer to dentry
1026 * @open: open callback
1027 *
1028 * This can be used to finish opening a file passed to i_op->atomic_open().
1029 *
1030 * If the open callback is set to NULL, then the standard f_op->open()
1031 * filesystem callback is substituted.
1032 *
1033 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
1034 * the return value of d_splice_alias(), then the caller needs to perform dput()
1035 * on it after finish_open().
1036 *
1037 * Returns zero on success or -errno if the open failed.
1038 */
1039int finish_open(struct file *file, struct dentry *dentry,
1040		int (*open)(struct inode *, struct file *))
1041{
1042	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
1043
1044	file->f_path.dentry = dentry;
1045	return do_dentry_open(file, open);
1046}
1047EXPORT_SYMBOL(finish_open);
1048
1049/**
1050 * finish_no_open - finish ->atomic_open() without opening the file
1051 *
1052 * @file: file pointer
1053 * @dentry: dentry or NULL (as returned from ->lookup())
1054 *
1055 * This can be used to set the result of a successful lookup in ->atomic_open().
1056 *
1057 * NB: unlike finish_open() this function does consume the dentry reference and
1058 * the caller need not dput() it.
1059 *
1060 * Returns "0" which must be the return value of ->atomic_open() after having
1061 * called this function.
1062 */
1063int finish_no_open(struct file *file, struct dentry *dentry)
1064{
1065	file->f_path.dentry = dentry;
1066	return 0;
1067}
1068EXPORT_SYMBOL(finish_no_open);
1069
1070char *file_path(struct file *filp, char *buf, int buflen)
1071{
1072	return d_path(&filp->f_path, buf, buflen);
1073}
1074EXPORT_SYMBOL(file_path);
1075
1076/**
1077 * vfs_open - open the file at the given path
1078 * @path: path to open
1079 * @file: newly allocated file with f_flag initialized
1080 */
1081int vfs_open(const struct path *path, struct file *file)
1082{
1083	int ret;
1084
1085	file->f_path = *path;
1086	ret = do_dentry_open(file, NULL);
1087	if (!ret) {
1088		/*
1089		 * Once we return a file with FMODE_OPENED, __fput() will call
1090		 * fsnotify_close(), so we need fsnotify_open() here for
1091		 * symmetry.
1092		 */
1093		fsnotify_open(file);
1094	}
1095	return ret;
1096}
1097
1098struct file *dentry_open(const struct path *path, int flags,
1099			 const struct cred *cred)
1100{
1101	int error;
1102	struct file *f;
1103
1104	/* We must always pass in a valid mount pointer. */
1105	BUG_ON(!path->mnt);
1106
1107	f = alloc_empty_file(flags, cred);
1108	if (!IS_ERR(f)) {
1109		error = vfs_open(path, f);
1110		if (error) {
1111			fput(f);
1112			f = ERR_PTR(error);
1113		}
1114	}
1115	return f;
1116}
1117EXPORT_SYMBOL(dentry_open);
1118
1119struct file *dentry_open_nonotify(const struct path *path, int flags,
1120				  const struct cred *cred)
1121{
1122	struct file *f = alloc_empty_file(flags, cred);
1123	if (!IS_ERR(f)) {
1124		int error;
1125
1126		file_set_fsnotify_mode(f, FMODE_NONOTIFY);
1127		error = vfs_open(path, f);
1128		if (error) {
1129			fput(f);
1130			f = ERR_PTR(error);
1131		}
1132	}
1133	return f;
1134}
1135
1136/**
1137 * dentry_create - Create and open a file
1138 * @path: path to create
1139 * @flags: O_ flags
1140 * @mode: mode bits for new file
1141 * @cred: credentials to use
1142 *
1143 * Caller must hold the parent directory's lock, and have prepared
1144 * a negative dentry, placed in @path->dentry, for the new file.
1145 *
1146 * Caller sets @path->mnt to the vfsmount of the filesystem where
1147 * the new file is to be created. The parent directory and the
1148 * negative dentry must reside on the same filesystem instance.
1149 *
1150 * On success, returns a "struct file *". Otherwise a ERR_PTR
1151 * is returned.
1152 */
1153struct file *dentry_create(const struct path *path, int flags, umode_t mode,
1154			   const struct cred *cred)
1155{
1156	struct file *f;
1157	int error;
1158
1159	f = alloc_empty_file(flags, cred);
1160	if (IS_ERR(f))
1161		return f;
1162
1163	error = vfs_create(mnt_idmap(path->mnt),
1164			   d_inode(path->dentry->d_parent),
1165			   path->dentry, mode, true);
1166	if (!error)
1167		error = vfs_open(path, f);
1168
1169	if (unlikely(error)) {
1170		fput(f);
1171		return ERR_PTR(error);
1172	}
1173	return f;
1174}
1175EXPORT_SYMBOL(dentry_create);
1176
/**
 * kernel_file_open - open a file for kernel internal use
 * @path:	path of the file to open
 * @flags:	open flags
 * @cred:	credentials for open
 *
 * Open a file for use by in-kernel consumers. The file is allocated with
 * alloc_empty_file_noaccount(), so it is not accounted against nr_files,
 * and it must not be installed into the file descriptor table.
 *
 * Return: Opened file on success, an error pointer on failure.
 */
struct file *kernel_file_open(const struct path *path, int flags,
				const struct cred *cred)
{
	struct file *file = alloc_empty_file_noaccount(flags, cred);
	int err;

	if (IS_ERR(file))
		return file;

	/*
	 * vfs_open() records the path, runs do_dentry_open() and emits
	 * fsnotify_open() on success.
	 */
	err = vfs_open(path, file);
	if (err) {
		fput(file);
		return ERR_PTR(err);
	}
	return file;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
1210
/*
 * WILL_CREATE - non-zero when @flags contain O_CREAT or __O_TMPFILE.
 * The argument is parenthesized so that compound expressions such as
 * WILL_CREATE(a | b) are masked as a whole.
 */
#define WILL_CREATE(flags)	((flags) & (O_CREAT | __O_TMPFILE))
/* The only open flags that are kept/accepted together with O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
1213
1214inline struct open_how build_open_how(int flags, umode_t mode)
1215{
1216	struct open_how how = {
1217		.flags = flags & VALID_OPEN_FLAGS,
1218		.mode = mode & S_IALLUGO,
1219	};
1220
1221	/* O_PATH beats everything else. */
1222	if (how.flags & O_PATH)
1223		how.flags &= O_PATH_FLAGS;
1224	/* Modes should only be set for create-like flags. */
1225	if (!WILL_CREATE(how.flags))
1226		how.mode = 0;
1227	return how;
1228}
1229
1230inline int build_open_flags(const struct open_how *how, struct open_flags *op)
1231{
1232	u64 flags = how->flags;
1233	u64 strip = O_CLOEXEC;
1234	int lookup_flags = 0;
1235	int acc_mode = ACC_MODE(flags);
1236
1237	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
1238			 "struct open_flags doesn't yet handle flags > 32 bits");
1239
1240	/*
1241	 * Strip flags that aren't relevant in determining struct open_flags.
1242	 */
1243	flags &= ~strip;
1244
1245	/*
1246	 * Older syscalls implicitly clear all of the invalid flags or argument
1247	 * values before calling build_open_flags(), but openat2(2) checks all
1248	 * of its arguments.
1249	 */
1250	if (flags & ~VALID_OPEN_FLAGS)
1251		return -EINVAL;
1252	if (how->resolve & ~VALID_RESOLVE_FLAGS)
1253		return -EINVAL;
1254
1255	/* Scoping flags are mutually exclusive. */
1256	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
1257		return -EINVAL;
1258
1259	/* Deal with the mode. */
1260	if (WILL_CREATE(flags)) {
1261		if (how->mode & ~S_IALLUGO)
1262			return -EINVAL;
1263		op->mode = how->mode | S_IFREG;
1264	} else {
1265		if (how->mode != 0)
1266			return -EINVAL;
1267		op->mode = 0;
1268	}
1269
1270	/*
1271	 * Block bugs where O_DIRECTORY | O_CREAT created regular files.
1272	 * Note, that blocking O_DIRECTORY | O_CREAT here also protects
1273	 * O_TMPFILE below which requires O_DIRECTORY being raised.
1274	 */
1275	if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
1276		return -EINVAL;
1277
1278	/* Now handle the creative implementation of O_TMPFILE. */
1279	if (flags & __O_TMPFILE) {
1280		/*
1281		 * In order to ensure programs get explicit errors when trying
1282		 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
1283		 * is raised alongside __O_TMPFILE.
1284		 */
1285		if (!(flags & O_DIRECTORY))
1286			return -EINVAL;
1287		if (!(acc_mode & MAY_WRITE))
1288			return -EINVAL;
1289	}
1290	if (flags & O_PATH) {
1291		/* O_PATH only permits certain other flags to be set. */
1292		if (flags & ~O_PATH_FLAGS)
1293			return -EINVAL;
1294		acc_mode = 0;
1295	}
1296
1297	/*
1298	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
1299	 * check for O_DSYNC if the need any syncing at all we enforce it's
1300	 * always set instead of having to deal with possibly weird behaviour
1301	 * for malicious applications setting only __O_SYNC.
1302	 */
1303	if (flags & __O_SYNC)
1304		flags |= O_DSYNC;
1305
1306	op->open_flag = flags;
1307
1308	/* O_TRUNC implies we need access checks for write permissions */
1309	if (flags & O_TRUNC)
1310		acc_mode |= MAY_WRITE;
1311
1312	/* Allow the LSM permission hook to distinguish append
1313	   access from general write access. */
1314	if (flags & O_APPEND)
1315		acc_mode |= MAY_APPEND;
1316
1317	op->acc_mode = acc_mode;
1318
1319	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
1320
1321	if (flags & O_CREAT) {
1322		op->intent |= LOOKUP_CREATE;
1323		if (flags & O_EXCL) {
1324			op->intent |= LOOKUP_EXCL;
1325			flags |= O_NOFOLLOW;
1326		}
1327	}
1328
1329	if (flags & O_DIRECTORY)
1330		lookup_flags |= LOOKUP_DIRECTORY;
1331	if (!(flags & O_NOFOLLOW))
1332		lookup_flags |= LOOKUP_FOLLOW;
1333
1334	if (how->resolve & RESOLVE_NO_XDEV)
1335		lookup_flags |= LOOKUP_NO_XDEV;
1336	if (how->resolve & RESOLVE_NO_MAGICLINKS)
1337		lookup_flags |= LOOKUP_NO_MAGICLINKS;
1338	if (how->resolve & RESOLVE_NO_SYMLINKS)
1339		lookup_flags |= LOOKUP_NO_SYMLINKS;
1340	if (how->resolve & RESOLVE_BENEATH)
1341		lookup_flags |= LOOKUP_BENEATH;
1342	if (how->resolve & RESOLVE_IN_ROOT)
1343		lookup_flags |= LOOKUP_IN_ROOT;
1344	if (how->resolve & RESOLVE_CACHED) {
1345		/* Don't bother even trying for create/truncate/tmpfile open */
1346		if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
1347			return -EAGAIN;
1348		lookup_flags |= LOOKUP_CACHED;
1349	}
1350
1351	op->lookup_flags = lookup_flags;
1352	return 0;
1353}
1354
1355/**
1356 * file_open_name - open file and return file pointer
1357 *
1358 * @name:	struct filename containing path to open
1359 * @flags:	open flags as per the open(2) second argument
1360 * @mode:	mode for the new file if O_CREAT is set, else ignored
1361 *
1362 * This is the helper to open a file from kernelspace if you really
1363 * have to.  But in generally you should not do this, so please move
1364 * along, nothing to see here..
1365 */
1366struct file *file_open_name(struct filename *name, int flags, umode_t mode)
1367{
1368	struct open_flags op;
1369	struct open_how how = build_open_how(flags, mode);
1370	int err = build_open_flags(&how, &op);
1371	if (err)
1372		return ERR_PTR(err);
1373	return do_filp_open(AT_FDCWD, name, &op);
1374}
1375
1376/**
1377 * filp_open - open file and return file pointer
1378 *
1379 * @filename:	path to open
1380 * @flags:	open flags as per the open(2) second argument
1381 * @mode:	mode for the new file if O_CREAT is set, else ignored
1382 *
1383 * This is the helper to open a file from kernelspace if you really
1384 * have to.  But in generally you should not do this, so please move
1385 * along, nothing to see here..
1386 */
1387struct file *filp_open(const char *filename, int flags, umode_t mode)
1388{
1389	struct filename *name = getname_kernel(filename);
1390	struct file *file = ERR_CAST(name);
1391
1392	if (!IS_ERR(name)) {
1393		file = file_open_name(name, flags, mode);
1394		putname(name);
1395	}
1396	return file;
1397}
1398EXPORT_SYMBOL(filp_open);
1399
1400struct file *file_open_root(const struct path *root,
1401			    const char *filename, int flags, umode_t mode)
1402{
1403	struct open_flags op;
1404	struct open_how how = build_open_how(flags, mode);
1405	int err = build_open_flags(&how, &op);
1406	if (err)
1407		return ERR_PTR(err);
1408	return do_file_open_root(root, filename, &op);
1409}
1410EXPORT_SYMBOL(file_open_root);
1411
1412static int do_sys_openat2(int dfd, const char __user *filename,
1413			  struct open_how *how)
1414{
1415	struct open_flags op;
1416	struct filename *tmp;
1417	int err, fd;
1418
1419	err = build_open_flags(how, &op);
1420	if (unlikely(err))
1421		return err;
1422
1423	tmp = getname(filename);
1424	if (IS_ERR(tmp))
1425		return PTR_ERR(tmp);
1426
1427	fd = get_unused_fd_flags(how->flags);
1428	if (likely(fd >= 0)) {
1429		struct file *f = do_filp_open(dfd, tmp, &op);
1430		if (IS_ERR(f)) {
1431			put_unused_fd(fd);
1432			fd = PTR_ERR(f);
1433		} else {
1434			fd_install(fd, f);
1435		}
1436	}
1437	putname(tmp);
1438	return fd;
1439}
1440
1441int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
1442{
1443	struct open_how how = build_open_how(flags, mode);
1444	return do_sys_openat2(dfd, filename, &how);
1445}
1446
1447
1448SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1449{
1450	if (force_o_largefile())
1451		flags |= O_LARGEFILE;
1452	return do_sys_open(AT_FDCWD, filename, flags, mode);
1453}
1454
1455SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1456		umode_t, mode)
1457{
1458	if (force_o_largefile())
1459		flags |= O_LARGEFILE;
1460	return do_sys_open(dfd, filename, flags, mode);
1461}
1462
1463SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
1464		struct open_how __user *, how, size_t, usize)
1465{
1466	int err;
1467	struct open_how tmp;
1468
1469	BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
1470	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
1471
1472	if (unlikely(usize < OPEN_HOW_SIZE_VER0))
1473		return -EINVAL;
1474	if (unlikely(usize > PAGE_SIZE))
1475		return -E2BIG;
1476
1477	err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
1478	if (err)
1479		return err;
1480
1481	audit_openat2_how(&tmp);
1482
1483	/* O_LARGEFILE is only allowed for non-O_PATH. */
1484	if (!(tmp.flags & O_PATH) && force_o_largefile())
1485		tmp.flags |= O_LARGEFILE;
1486
1487	return do_sys_openat2(dfd, filename, &tmp);
1488}
1489
#ifdef CONFIG_COMPAT
/*
 * Compat open(2): same as sys_open(), except that O_LARGEFILE is never
 * added on the caller's behalf.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	struct open_how how = build_open_how(flags, mode);

	return do_sys_openat2(AT_FDCWD, filename, &how);
}

/*
 * Compat openat(2): same as sys_openat(), except that O_LARGEFILE is never
 * added on the caller's behalf.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
	struct open_how how = build_open_how(flags, mode);

	return do_sys_openat2(dfd, filename, &how);
}
#endif
1509
1510#ifndef __alpha__
1511
1512/*
1513 * For backward compatibility?  Maybe this should be moved
1514 * into arch/i386 instead?
1515 */
1516SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1517{
1518	int flags = O_CREAT | O_WRONLY | O_TRUNC;
1519
1520	if (force_o_largefile())
1521		flags |= O_LARGEFILE;
1522	return do_sys_open(AT_FDCWD, pathname, flags, mode);
1523}
1524#endif
1525
1526/*
1527 * "id" is the POSIX thread ID. We use the
1528 * files pointer for this..
1529 */
1530static int filp_flush(struct file *filp, fl_owner_t id)
1531{
1532	int retval = 0;
1533
1534	if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp,
1535			"VFS: Close: file count is 0 (f_op=%ps)",
1536			filp->f_op)) {
1537		return 0;
1538	}
1539
1540	if (filp->f_op->flush)
1541		retval = filp->f_op->flush(filp, id);
1542
1543	if (likely(!(filp->f_mode & FMODE_PATH))) {
1544		dnotify_flush(filp, id);
1545		locks_remove_posix(filp, id);
1546	}
1547	return retval;
1548}
1549
1550int filp_close(struct file *filp, fl_owner_t id)
1551{
1552	int retval;
1553
1554	retval = filp_flush(filp, id);
1555	fput_close(filp);
1556
1557	return retval;
1558}
1559EXPORT_SYMBOL(filp_close);
1560
1561/*
1562 * Careful here! We test whether the file pointer is NULL before
1563 * releasing the fd. This ensures that one clone task can't release
1564 * an fd while another clone is opening it.
1565 */
1566SYSCALL_DEFINE1(close, unsigned int, fd)
1567{
1568	int retval;
1569	struct file *file;
1570
1571	file = file_close_fd(fd);
1572	if (!file)
1573		return -EBADF;
1574
1575	retval = filp_flush(file, current->files);
1576
1577	/*
1578	 * We're returning to user space. Don't bother
1579	 * with any delayed fput() cases.
1580	 */
1581	fput_close_sync(file);
1582
1583	if (likely(retval == 0))
1584		return 0;
1585
1586	/* can't restart close syscall because file table entry was cleared */
1587	if (retval == -ERESTARTSYS ||
1588	    retval == -ERESTARTNOINTR ||
1589	    retval == -ERESTARTNOHAND ||
1590	    retval == -ERESTART_RESTARTBLOCK)
1591		retval = -EINTR;
1592
1593	return retval;
1594}
1595
1596/*
1597 * This routine simulates a hangup on the tty, to arrange that users
1598 * are given clean terminals at login time.
1599 */
1600SYSCALL_DEFINE0(vhangup)
1601{
1602	if (capable(CAP_SYS_TTY_CONFIG)) {
1603		tty_vhangup_self();
1604		return 0;
1605	}
1606	return -EPERM;
1607}
1608
1609/*
1610 * Called when an inode is about to be open.
1611 * We use this to disallow opening large files on 32bit systems if
1612 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1613 * on this flag in sys_open.
1614 */
1615int generic_file_open(struct inode * inode, struct file * filp)
1616{
1617	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1618		return -EOVERFLOW;
1619	return 0;
1620}
1621
1622EXPORT_SYMBOL(generic_file_open);
1623
1624/*
1625 * This is used by subsystems that don't want seekable
1626 * file descriptors. The function is not supposed to ever fail, the only
1627 * reason it returns an 'int' and not 'void' is so that it can be plugged
1628 * directly into file_operations structure.
1629 */
1630int nonseekable_open(struct inode *inode, struct file *filp)
1631{
1632	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1633	return 0;
1634}
1635
1636EXPORT_SYMBOL(nonseekable_open);
1637
1638/*
1639 * stream_open is used by subsystems that want stream-like file descriptors.
1640 * Such file descriptors are not seekable and don't have notion of position
1641 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
1642 * Contrary to file descriptors of other regular files, .read() and .write()
1643 * can run simultaneously.
1644 *
1645 * stream_open never fails and is marked to return int so that it could be
1646 * directly used as file_operations.open .
1647 */
1648int stream_open(struct inode *inode, struct file *filp)
1649{
1650	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
1651	filp->f_mode |= FMODE_STREAM;
1652	return 0;
1653}
1654
1655EXPORT_SYMBOL(stream_open);