fs/file.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / file.c
at master 40 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/file.c
   4 *
   5 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
   6 *
   7 *  Manage the dynamic fd arrays in the process files_struct.
   8 */
   9
  10#include <linux/syscalls.h>
  11#include <linux/export.h>
  12#include <linux/fs.h>
  13#include <linux/kernel.h>
  14#include <linux/mm.h>
  15#include <linux/sched/signal.h>
  16#include <linux/slab.h>
  17#include <linux/file.h>
  18#include <linux/fdtable.h>
  19#include <linux/bitops.h>
  20#include <linux/spinlock.h>
  21#include <linux/rcupdate.h>
  22#include <linux/close_range.h>
  23#include <linux/file_ref.h>
  24#include <net/sock.h>
  25#include <linux/init_task.h>
  26
  27#include "internal.h"
  28
  29static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
  30{
  31	/*
  32	 * If the reference count was already in the dead zone, then this
  33	 * put() operation is imbalanced. Warn, put the reference count back to
  34	 * DEAD and tell the caller to not deconstruct the object.
  35	 */
  36	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
  37		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
  38		return false;
  39	}
  40
  41	/*
  42	 * This is a put() operation on a saturated refcount. Restore the
  43	 * mean saturation value and tell the caller to not deconstruct the
  44	 * object.
  45	 */
  46	if (cnt > FILE_REF_MAXREF)
  47		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
  48	return false;
  49}
  50
  51/**
  52 * __file_ref_put - Slowpath of file_ref_put()
  53 * @ref:	Pointer to the reference count
  54 * @cnt:	Current reference count
  55 *
  56 * Invoked when the reference count is outside of the valid zone.
  57 *
  58 * Return:
  59 *	True if this was the last reference with no future references
  60 *	possible. This signals the caller that it can safely schedule the
  61 *	object, which is protected by the reference counter, for
  62 *	deconstruction.
  63 *
  64 *	False if there are still active references or the put() raced
  65 *	with a concurrent get()/put() pair. Caller is not allowed to
  66 *	deconstruct the protected object.
  67 */
  68bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
  69{
  70	/* Did this drop the last reference? */
  71	if (likely(cnt == FILE_REF_NOREF)) {
  72		/*
  73		 * Carefully try to set the reference count to FILE_REF_DEAD.
  74		 *
  75		 * This can fail if a concurrent get() operation has
  76		 * elevated it again or the corresponding put() even marked
  77		 * it dead already. Both are valid situations and do not
  78		 * require a retry. If this fails the caller is not
  79		 * allowed to deconstruct the object.
  80		 */
  81		if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
  82			return false;
  83
  84		/*
  85		 * The caller can safely schedule the object for
  86		 * deconstruction. Provide acquire ordering.
  87		 */
  88		smp_acquire__after_ctrl_dep();
  89		return true;
  90	}
  91
  92	return __file_ref_put_badval(ref, cnt);
  93}
  94EXPORT_SYMBOL_GPL(__file_ref_put);
  95
/*
 * Upper bound on descriptor table size; compared against the requested
 * slot count in expand_files() and alloc_fdtable().  Defaults to 1M.
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/*
 * NOTE(review): presumably the lower bound accepted when sysctl_nr_open is
 * written; the consumer is not in this file section - confirm.
 */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and (largest size_t / pointer size), with the low
 * bits masked off by "& -BITS_PER_LONG" so the result is a multiple of
 * BITS_PER_LONG.  NOTE(review): presumably the upper bound accepted when
 * sysctl_nr_open is written - confirm against the sysctl table.
 */
unsigned int sysctl_nr_open_max =
	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
 102
 103static void __free_fdtable(struct fdtable *fdt)
 104{
 105	kvfree(fdt->fd);
 106	kvfree(fdt->open_fds);
 107	kfree(fdt);
 108}
 109
 110static void free_fdtable_rcu(struct rcu_head *rcu)
 111{
 112	__free_fdtable(container_of(rcu, struct fdtable, rcu));
 113}
 114
/*
 * Sizing of the second-level bitmap (->full_fds_bits), which carries one
 * bit per long of an nr-bit first-level bitmap:
 *   BITBIT_NR(nr)   - number of longs it occupies
 *   BITBIT_SIZE(nr) - the same, in bytes
 */
#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
 119/*
 120 * Copy 'count' fd bits from the old table to the new table and clear the extra
 121 * space if any.  This does not copy the file pointers.  Called with the files
 122 * spinlock held for write.
 123 */
 124static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
 125			    unsigned int copy_words)
 126{
 127	unsigned int nwords = fdt_words(nfdt);
 128
 129	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
 130			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
 131	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
 132			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
 133	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
 134			copy_words, nwords);
 135}
 136
 137/*
 138 * Copy all file descriptors from the old table to the new, expanded table and
 139 * clear the extra space.  Called with the files spinlock held for write.
 140 */
 141static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 142{
 143	size_t cpy, set;
 144
 145	BUG_ON(nfdt->max_fds < ofdt->max_fds);
 146
 147	cpy = ofdt->max_fds * sizeof(struct file *);
 148	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 149	memcpy(nfdt->fd, ofdt->fd, cpy);
 150	memset((char *)nfdt->fd + cpy, 0, set);
 151
 152	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
 153}
 154
 155/*
 156 * Note how the fdtable bitmap allocations very much have to be a multiple of
 157 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 158 * 'unsigned long' in some places, but simply because that is how the Linux
 159 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 160 * they are very much "bits in an array of unsigned long".
 161 */
 162static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
 163{
 164	struct fdtable *fdt;
 165	unsigned int nr;
 166	void *data;
 167
 168	/*
 169	 * Figure out how many fds we actually want to support in this fdtable.
 170	 * Allocation steps are keyed to the size of the fdarray, since it
 171	 * grows far faster than any of the other dynamic data. We try to fit
 172	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
 173	 * and growing in powers of two from there on.  Since we called only
 174	 * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
 175	 * already gives BITS_PER_LONG slots), the above boils down to
 176	 * 1.  use the smallest power of two large enough to give us that many
 177	 * slots.
 178	 * 2.  on 32bit skip 64 and 128 - the minimal capacity we want there is
 179	 * 256 slots (i.e. 1Kb fd array).
 180	 * 3.  on 64bit don't skip anything, 1Kb fd array means 128 slots there
 181	 * and we are never going to be asked for 64 or less.
 182	 */
 183	if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
 184		nr = 256;
 185	else
 186		nr = roundup_pow_of_two(slots_wanted);
 187	/*
 188	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 189	 * had been set lower between the check in expand_files() and here.
 190	 *
 191	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
 192	 * bitmaps handling below becomes unpleasant, to put it mildly...
 193	 */
 194	if (unlikely(nr > sysctl_nr_open)) {
 195		nr = round_down(sysctl_nr_open, BITS_PER_LONG);
 196		if (nr < slots_wanted)
 197			return ERR_PTR(-EMFILE);
 198	}
 199
 200	/*
 201	 * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
 202	 * and kvmalloc() will warn if the allocation size is greater than
 203	 * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
 204	 *
 205	 * This can happen when sysctl_nr_open is set to a very high value and
 206	 * a process tries to use a file descriptor near that limit. For example,
 207	 * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
 208	 * systemd typically sets it to - then trying to use a file descriptor
 209	 * close to that value will require allocating a file descriptor table
 210	 * that exceeds 8GB in size.
 211	 */
 212	if (unlikely(nr > INT_MAX / sizeof(struct file *)))
 213		return ERR_PTR(-EMFILE);
 214
 215	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
 216	if (!fdt)
 217		goto out;
 218	fdt->max_fds = nr;
 219	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
 220	if (!data)
 221		goto out_fdt;
 222	fdt->fd = data;
 223
 224	data = kvmalloc(max_t(size_t,
 225				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
 226				 GFP_KERNEL_ACCOUNT);
 227	if (!data)
 228		goto out_arr;
 229	fdt->open_fds = data;
 230	data += nr / BITS_PER_BYTE;
 231	fdt->close_on_exec = data;
 232	data += nr / BITS_PER_BYTE;
 233	fdt->full_fds_bits = data;
 234
 235	return fdt;
 236
 237out_arr:
 238	kvfree(fdt->fd);
 239out_fdt:
 240	kfree(fdt);
 241out:
 242	return ERR_PTR(-ENOMEM);
 243}
 244
/*
 * Expand the file descriptor table.
 * This function allocates a new fdtable (fd array and bitmaps included) with
 * room for at least nr + 1 descriptors, copies the current table into it and
 * publishes it in files->fdt.
 * Return <0 error code on error; 0 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped around the allocation.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	/* alloc_fdtable() uses GFP_KERNEL allocations, so drop the spinlock */
	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr + 1);

	/* make sure all fd_install() have seen resize_in_progress
	 * or have finished their rcu_read_lock_sched() section.
	 * Skipped when the count shows a single user of this files_struct.
	 */
	if (atomic_read(&files->count) > 1)
		synchronize_rcu();

	/* retake the lock before the error check so it is held on every return */
	spin_lock(&files->file_lock);
	if (IS_ERR(new_fdt))
		return PTR_ERR(new_fdt);
	cur_fdt = files_fdtable(files);
	/* the caller only asks for expansion when nr does not fit the current table */
	BUG_ON(nr < cur_fdt->max_fds);
	copy_fdtable(new_fdt, cur_fdt);
	rcu_assign_pointer(files->fdt, new_fdt);
	/* the table embedded in files_struct is never freed separately */
	if (cur_fdt != &files->fdtab)
		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	/* coupled with smp_rmb() in fd_install() */
	smp_wmb();
	return 0;
}
 280
 281/*
 282 * Expand files.
 283 * This function will expand the file structures, if the requested size exceeds
 284 * the current capacity and there is room for expansion.
 285 * Return <0 error code on error; 0 on success.
 286 * The files->file_lock should be held on entry, and will be held on exit.
 287 */
 288static int expand_files(struct files_struct *files, unsigned int nr)
 289	__releases(files->file_lock)
 290	__acquires(files->file_lock)
 291{
 292	struct fdtable *fdt;
 293	int error;
 294
 295repeat:
 296	fdt = files_fdtable(files);
 297
 298	/* Do we need to expand? */
 299	if (nr < fdt->max_fds)
 300		return 0;
 301
 302	if (unlikely(files->resize_in_progress)) {
 303		spin_unlock(&files->file_lock);
 304		wait_event(files->resize_wait, !files->resize_in_progress);
 305		spin_lock(&files->file_lock);
 306		goto repeat;
 307	}
 308
 309	/* Can we expand? */
 310	if (unlikely(nr >= sysctl_nr_open))
 311		return -EMFILE;
 312
 313	/* All good, so we try */
 314	files->resize_in_progress = true;
 315	error = expand_fdtable(files, nr);
 316	files->resize_in_progress = false;
 317
 318	wake_up_all(&files->resize_wait);
 319	return error;
 320}
 321
 322static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
 323				       bool set)
 324{
 325	if (set) {
 326		__set_bit(fd, fdt->close_on_exec);
 327	} else {
 328		if (test_bit(fd, fdt->close_on_exec))
 329			__clear_bit(fd, fdt->close_on_exec);
 330	}
 331}
 332
 333static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
 334{
 335	__set_bit(fd, fdt->open_fds);
 336	__set_close_on_exec(fd, fdt, set);
 337	fd /= BITS_PER_LONG;
 338	if (!~fdt->open_fds[fd])
 339		__set_bit(fd, fdt->full_fds_bits);
 340}
 341
 342static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
 343{
 344	__clear_bit(fd, fdt->open_fds);
 345	fd /= BITS_PER_LONG;
 346	if (test_bit(fd, fdt->full_fds_bits))
 347		__clear_bit(fd, fdt->full_fds_bits);
 348}
 349
 350static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
 351{
 352	return test_bit(fd, fdt->open_fds);
 353}
 354
 355/*
 356 * Note that a sane fdtable size always has to be a multiple of
 357 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 358 *
 359 * punch_hole is optional - when close_range() is asked to unshare
 360 * and close, we don't need to copy descriptors in that range, so
 361 * a smaller cloned descriptor table might suffice if the last
 362 * currently opened descriptor falls into that range.
 363 */
 364static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
 365{
 366	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
 367
 368	if (last == fdt->max_fds)
 369		return NR_OPEN_DEFAULT;
 370	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
 371		last = find_last_bit(fdt->open_fds, punch_hole->from);
 372		if (last == punch_hole->from)
 373			return NR_OPEN_DEFAULT;
 374	}
 375	return ALIGN(last + 1, BITS_PER_LONG);
 376}
 377
/*
 * Allocate a new descriptor table and copy contents from the passed in
 * instance.  Returns a pointer to cloned table on success, ERR_PTR()
 * on failure.  For 'punch_hole' see sane_fdtable_size().
 *
 * Every file copied into the clone gets an extra reference via get_file().
 * oldf->file_lock is taken internally and is not held on return.
 */
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	unsigned int open_files, i;
	struct fdtable *old_fdt, *new_fdt;

	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		return ERR_PTR(-ENOMEM);

	atomic_set(&newf->count, 1);

	/* start out with the table embedded in the new files_struct */
	spin_lock_init(&newf->file_lock);
	newf->resize_in_progress = false;
	init_waitqueue_head(&newf->resize_wait);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->full_fds_bits = newf->full_fds_bits_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = sane_fdtable_size(old_fdt, punch_hole);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 * This is a loop because the lock is dropped for the allocation and
	 * the required size is recomputed afterwards.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		/* discard a table allocated by a previous iteration */
		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files);
		if (IS_ERR(new_fdt)) {
			kmem_cache_free(files_cachep, newf);
			return ERR_CAST(new_fdt);
		}

		/*
		 * Reacquire the oldf lock and re-read the pointer to its fd
		 * table: it may have been replaced by a bigger one in the
		 * meantime, and we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = sane_fdtable_size(old_fdt, punch_hole);
	}

	/* open_files is a multiple of BITS_PER_LONG, see sane_fdtable_size() */
	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	/*
	 * We may be racing against fd allocation from other threads using this
	 * files_struct, despite holding ->file_lock.
	 *
	 * alloc_fd() might have already claimed a slot, while fd_install()
	 * did not populate it yet. Note the latter operates locklessly, so
	 * the file can show up as we are walking the array below.
	 *
	 * At the same time we know no files will disappear as all other
	 * operations take the lock.
	 *
	 * Instead of trying to placate userspace racing with itself, we
	 * ref the file if we see it and mark the fd slot as unused otherwise.
	 */
	for (i = open_files; i != 0; i--) {
		struct file *f = rcu_dereference_raw(*old_fds++);
		if (f) {
			get_file(f);
		} else {
			/* open_files - i is the descriptor number of this slot */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* clear the remainder */
	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;
}
 473
 474static struct fdtable *close_files(struct files_struct * files)
 475{
 476	/*
 477	 * It is safe to dereference the fd table without RCU or
 478	 * ->file_lock because this is the last reference to the
 479	 * files structure.
 480	 */
 481	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
 482	unsigned int i, j = 0;
 483
 484	for (;;) {
 485		unsigned long set;
 486		i = j * BITS_PER_LONG;
 487		if (i >= fdt->max_fds)
 488			break;
 489		set = fdt->open_fds[j++];
 490		while (set) {
 491			if (set & 1) {
 492				struct file *file = fdt->fd[i];
 493				if (file) {
 494					filp_close(file, files);
 495					cond_resched();
 496				}
 497			}
 498			i++;
 499			set >>= 1;
 500		}
 501	}
 502
 503	return fdt;
 504}
 505
 506void put_files_struct(struct files_struct *files)
 507{
 508	if (atomic_dec_and_test(&files->count)) {
 509		struct fdtable *fdt = close_files(files);
 510
 511		/* free the arrays if they are not embedded */
 512		if (fdt != &files->fdtab)
 513			__free_fdtable(fdt);
 514		kmem_cache_free(files_cachep, files);
 515	}
 516}
 517
 518void exit_files(struct task_struct *tsk)
 519{
 520	struct files_struct * files = tsk->files;
 521
 522	if (files) {
 523		task_lock(tsk);
 524		tsk->files = NULL;
 525		task_unlock(tsk);
 526		put_files_struct(files);
 527	}
 528}
 529
 530struct files_struct init_files = {
 531	.count		= ATOMIC_INIT(1),
 532	.fdt		= &init_files.fdtab,
 533	.fdtab		= {
 534		.max_fds	= NR_OPEN_DEFAULT,
 535		.fd		= &init_files.fd_array[0],
 536		.close_on_exec	= init_files.close_on_exec_init,
 537		.open_fds	= init_files.open_fds_init,
 538		.full_fds_bits	= init_files.full_fds_bits_init,
 539	},
 540	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 541	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
 542};
 543
 544static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
 545{
 546	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
 547	unsigned int maxbit = maxfd / BITS_PER_LONG;
 548	unsigned int bitbit = start / BITS_PER_LONG;
 549	unsigned int bit;
 550
 551	/*
 552	 * Try to avoid looking at the second level bitmap
 553	 */
 554	bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
 555				 start & (BITS_PER_LONG - 1));
 556	if (bit < BITS_PER_LONG)
 557		return bit + bitbit * BITS_PER_LONG;
 558
 559	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
 560	if (bitbit >= maxfd)
 561		return maxfd;
 562	if (bitbit > start)
 563		start = bitbit;
 564	return find_next_zero_bit(fdt->open_fds, maxfd, start);
 565}
 566
 567/*
 568 * allocate a file descriptor, mark it busy.
 569 */
 570static int alloc_fd(unsigned start, unsigned end, unsigned flags)
 571{
 572	struct files_struct *files = current->files;
 573	unsigned int fd;
 574	int error;
 575	struct fdtable *fdt;
 576
 577	spin_lock(&files->file_lock);
 578repeat:
 579	fdt = files_fdtable(files);
 580	fd = start;
 581	if (fd < files->next_fd)
 582		fd = files->next_fd;
 583
 584	if (likely(fd < fdt->max_fds))
 585		fd = find_next_fd(fdt, fd);
 586
 587	/*
 588	 * N.B. For clone tasks sharing a files structure, this test
 589	 * will limit the total number of files that can be opened.
 590	 */
 591	error = -EMFILE;
 592	if (unlikely(fd >= end))
 593		goto out;
 594
 595	if (unlikely(fd >= fdt->max_fds)) {
 596		error = expand_files(files, fd);
 597		if (error < 0)
 598			goto out;
 599
 600		goto repeat;
 601	}
 602
 603	if (start <= files->next_fd)
 604		files->next_fd = fd + 1;
 605
 606	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
 607	error = fd;
 608	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
 609
 610out:
 611	spin_unlock(&files->file_lock);
 612	return error;
 613}
 614
 615int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
 616{
 617	return alloc_fd(0, nofile, flags);
 618}
 619
 620int get_unused_fd_flags(unsigned flags)
 621{
 622	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
 623}
 624EXPORT_SYMBOL(get_unused_fd_flags);
 625
 626static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 627{
 628	struct fdtable *fdt = files_fdtable(files);
 629	__clear_open_fd(fd, fdt);
 630	if (fd < files->next_fd)
 631		files->next_fd = fd;
 632}
 633
 634void put_unused_fd(unsigned int fd)
 635{
 636	struct files_struct *files = current->files;
 637	spin_lock(&files->file_lock);
 638	__put_unused_fd(files, fd);
 639	spin_unlock(&files->file_lock);
 640}
 641
 642EXPORT_SYMBOL(put_unused_fd);
 643
 644/*
 645 * Install a file pointer in the fd array while it is being resized.
 646 *
 647 * We need to make sure our update to the array does not get lost as the resizing
 648 * thread can be copying the content as we modify it.
 649 *
 650 * We have two ways to do it:
 651 * - go off CPU waiting for resize_in_progress to clear
 652 * - take the spin lock
 653 *
 654 * The latter is trivial to implement and saves us from having to might_sleep()
 655 * for debugging purposes.
 656 *
 657 * This is moved out of line from fd_install() to convince gcc to optimize that
 658 * routine better.
 659 */
 660static void noinline fd_install_slowpath(unsigned int fd, struct file *file)
 661{
 662	struct files_struct *files = current->files;
 663	struct fdtable *fdt;
 664
 665	spin_lock(&files->file_lock);
 666	fdt = files_fdtable(files);
 667	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
 668	rcu_assign_pointer(fdt->fd[fd], file);
 669	spin_unlock(&files->file_lock);
 670}
 671
 672/**
 673 * fd_install - install a file pointer in the fd array
 674 * @fd: file descriptor to install the file in
 675 * @file: the file to install
 676 *
 677 * This consumes the "file" refcount, so callers should treat it
 678 * as if they had called fput(file).
 679 */
 680void fd_install(unsigned int fd, struct file *file)
 681{
 682	struct files_struct *files = current->files;
 683	struct fdtable *fdt;
 684
 685	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
 686		return;
 687
 688	rcu_read_lock_sched();
 689	if (unlikely(files->resize_in_progress)) {
 690		rcu_read_unlock_sched();
 691		fd_install_slowpath(fd, file);
 692		return;
 693	}
 694	/* coupled with smp_wmb() in expand_fdtable() */
 695	smp_rmb();
 696	fdt = rcu_dereference_sched(files->fdt);
 697	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
 698	rcu_assign_pointer(fdt->fd[fd], file);
 699	rcu_read_unlock_sched();
 700}
 701
 702EXPORT_SYMBOL(fd_install);
 703
 704/**
 705 * file_close_fd_locked - return file associated with fd
 706 * @files: file struct to retrieve file from
 707 * @fd: file descriptor to retrieve file for
 708 *
 709 * Doesn't take a separate reference count.
 710 *
 711 * Context: files_lock must be held.
 712 *
 713 * Returns: The file associated with @fd (NULL if @fd is not open)
 714 */
 715struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
 716{
 717	struct fdtable *fdt = files_fdtable(files);
 718	struct file *file;
 719
 720	lockdep_assert_held(&files->file_lock);
 721
 722	if (fd >= fdt->max_fds)
 723		return NULL;
 724
 725	fd = array_index_nospec(fd, fdt->max_fds);
 726	file = rcu_dereference_raw(fdt->fd[fd]);
 727	if (file) {
 728		rcu_assign_pointer(fdt->fd[fd], NULL);
 729		__put_unused_fd(files, fd);
 730	}
 731	return file;
 732}
 733
 734int close_fd(unsigned fd)
 735{
 736	struct files_struct *files = current->files;
 737	struct file *file;
 738
 739	spin_lock(&files->file_lock);
 740	file = file_close_fd_locked(files, fd);
 741	spin_unlock(&files->file_lock);
 742	if (!file)
 743		return -EBADF;
 744
 745	return filp_close(file, files);
 746}
 747EXPORT_SYMBOL(close_fd);
 748
 749/**
 750 * last_fd - return last valid index into fd table
 751 * @fdt: File descriptor table.
 752 *
 753 * Context: Either rcu read lock or files_lock must be held.
 754 *
 755 * Returns: Last valid index into fdtable.
 756 */
 757static inline unsigned last_fd(struct fdtable *fdt)
 758{
 759	return fdt->max_fds - 1;
 760}
 761
 762static inline void __range_cloexec(struct files_struct *cur_fds,
 763				   unsigned int fd, unsigned int max_fd)
 764{
 765	struct fdtable *fdt;
 766
 767	/* make sure we're using the correct maximum value */
 768	spin_lock(&cur_fds->file_lock);
 769	fdt = files_fdtable(cur_fds);
 770	max_fd = min(last_fd(fdt), max_fd);
 771	if (fd <= max_fd)
 772		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
 773	spin_unlock(&cur_fds->file_lock);
 774}
 775
 776static inline void __range_close(struct files_struct *files, unsigned int fd,
 777				 unsigned int max_fd)
 778{
 779	struct file *file;
 780	unsigned n;
 781
 782	spin_lock(&files->file_lock);
 783	n = last_fd(files_fdtable(files));
 784	max_fd = min(max_fd, n);
 785
 786	for (; fd <= max_fd; fd++) {
 787		file = file_close_fd_locked(files, fd);
 788		if (file) {
 789			spin_unlock(&files->file_lock);
 790			filp_close(file, files);
 791			cond_resched();
 792			spin_lock(&files->file_lock);
 793		} else if (need_resched()) {
 794			spin_unlock(&files->file_lock);
 795			cond_resched();
 796			spin_lock(&files->file_lock);
 797		}
 798	}
 799	spin_unlock(&files->file_lock);
 800}
 801
 802/**
 803 * sys_close_range() - Close all file descriptors in a given range.
 804 *
 805 * @fd:     starting file descriptor to close
 806 * @max_fd: last file descriptor to close
 807 * @flags:  CLOSE_RANGE flags.
 808 *
 809 * This closes a range of file descriptors. All file descriptors
 810 * from @fd up to and including @max_fd are closed.
 811 * Currently, errors to close a given file descriptor are ignored.
 812 */
 813SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
 814		unsigned int, flags)
 815{
 816	struct task_struct *me = current;
 817	struct files_struct *cur_fds = me->files, *fds = NULL;
 818
 819	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
 820		return -EINVAL;
 821
 822	if (fd > max_fd)
 823		return -EINVAL;
 824
 825	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
 826		struct fd_range range = {fd, max_fd}, *punch_hole = &range;
 827
 828		/*
 829		 * If the caller requested all fds to be made cloexec we always
 830		 * copy all of the file descriptors since they still want to
 831		 * use them.
 832		 */
 833		if (flags & CLOSE_RANGE_CLOEXEC)
 834			punch_hole = NULL;
 835
 836		fds = dup_fd(cur_fds, punch_hole);
 837		if (IS_ERR(fds))
 838			return PTR_ERR(fds);
 839		/*
 840		 * We used to share our file descriptor table, and have now
 841		 * created a private one, make sure we're using it below.
 842		 */
 843		swap(cur_fds, fds);
 844	}
 845
 846	if (flags & CLOSE_RANGE_CLOEXEC)
 847		__range_cloexec(cur_fds, fd, max_fd);
 848	else
 849		__range_close(cur_fds, fd, max_fd);
 850
 851	if (fds) {
 852		/*
 853		 * We're done closing the files we were supposed to. Time to install
 854		 * the new file descriptor table and drop the old one.
 855		 */
 856		task_lock(me);
 857		me->files = cur_fds;
 858		task_unlock(me);
 859		put_files_struct(fds);
 860	}
 861
 862	return 0;
 863}
 864
 865/**
 866 * file_close_fd - return file associated with fd
 867 * @fd: file descriptor to retrieve file for
 868 *
 869 * Doesn't take a separate reference count.
 870 *
 871 * Returns: The file associated with @fd (NULL if @fd is not open)
 872 */
 873struct file *file_close_fd(unsigned int fd)
 874{
 875	struct files_struct *files = current->files;
 876	struct file *file;
 877
 878	spin_lock(&files->file_lock);
 879	file = file_close_fd_locked(files, fd);
 880	spin_unlock(&files->file_lock);
 881
 882	return file;
 883}
 884
 885void do_close_on_exec(struct files_struct *files)
 886{
 887	unsigned i;
 888	struct fdtable *fdt;
 889
 890	/* exec unshares first */
 891	spin_lock(&files->file_lock);
 892	for (i = 0; ; i++) {
 893		unsigned long set;
 894		unsigned fd = i * BITS_PER_LONG;
 895		fdt = files_fdtable(files);
 896		if (fd >= fdt->max_fds)
 897			break;
 898		set = fdt->close_on_exec[i];
 899		if (!set)
 900			continue;
 901		fdt->close_on_exec[i] = 0;
 902		for ( ; set ; fd++, set >>= 1) {
 903			struct file *file;
 904			if (!(set & 1))
 905				continue;
 906			file = fdt->fd[fd];
 907			if (!file)
 908				continue;
 909			rcu_assign_pointer(fdt->fd[fd], NULL);
 910			__put_unused_fd(files, fd);
 911			spin_unlock(&files->file_lock);
 912			filp_close(file, files);
 913			cond_resched();
 914			spin_lock(&files->file_lock);
 915		}
 916
 917	}
 918	spin_unlock(&files->file_lock);
 919}
 920
/*
 * Single attempt at taking a reference on the file that *@f points to.
 *
 * Returns NULL if *@f is NULL, the file with a reference held if the
 * pointer was still the same after the reference was taken, and
 * ERR_PTR(-EAGAIN) if the reference could not be taken or the pointer
 * changed in between (any reference taken is dropped again).
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
	struct file __rcu *file;
	struct file __rcu *file_reloaded;
	struct file __rcu *file_reloaded_cmp;

	file = rcu_dereference_raw(*f);
	if (!file)
		return NULL;

	/* the count can no longer be raised: let the caller decide to retry */
	if (unlikely(!file_ref_get(&file->f_ref)))
		return ERR_PTR(-EAGAIN);

	/* load the pointer a second time now that we hold a reference */
	file_reloaded = rcu_dereference_raw(*f);

	/*
	 * Ensure that all accesses have a dependency on the load from
	 * rcu_dereference_raw() above so we get correct ordering
	 * between reuse/allocation and the pointer check below.
	 */
	file_reloaded_cmp = file_reloaded;
	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

	/*
	 * file_ref_get() above provided a full memory barrier when we
	 * acquired a reference.
	 *
	 * This is paired with the write barrier from assigning to the
	 * __rcu protected file pointer so that if that pointer still
	 * matches the current file, we know we have successfully
	 * acquired a reference to the right file.
	 *
	 * If the pointers don't match the file has been reallocated by
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	if (file == file_reloaded_cmp)
		return file_reloaded;

	/* wrong file: give back the reference we took on it */
	fput(file);
	return ERR_PTR(-EAGAIN);
}
 962
 963/**
 964 * get_file_rcu - try go get a reference to a file under rcu
 965 * @f: the file to get a reference on
 966 *
 967 * This function tries to get a reference on @f carefully verifying that
 968 * @f hasn't been reused.
 969 *
 970 * This function should rarely have to be used and only by users who
 971 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 972 *
 973 * Return: Returns @f with the reference count increased or NULL.
 974 */
 975struct file *get_file_rcu(struct file __rcu **f)
 976{
 977	for (;;) {
 978		struct file __rcu *file;
 979
 980		file = __get_file_rcu(f);
 981		if (!IS_ERR(file))
 982			return file;
 983	}
 984}
 985EXPORT_SYMBOL_GPL(get_file_rcu);
 986
 987/**
 988 * get_file_active - try go get a reference to a file
 989 * @f: the file to get a reference on
 990 *
 991 * In contast to get_file_rcu() the pointer itself isn't part of the
 992 * reference counting.
 993 *
 994 * This function should rarely have to be used and only by users who
 995 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 996 *
 997 * Return: Returns @f with the reference count increased or NULL.
 998 */
 999struct file *get_file_active(struct file **f)
1000{
1001	struct file __rcu *file;
1002
1003	rcu_read_lock();
1004	file = __get_file_rcu(f);
1005	rcu_read_unlock();
1006	if (IS_ERR(file))
1007		file = NULL;
1008	return file;
1009}
1010EXPORT_SYMBOL_GPL(get_file_active);
1011
1012static inline struct file *__fget_files_rcu(struct files_struct *files,
1013       unsigned int fd, fmode_t mask)
1014{
1015	for (;;) {
1016		struct file *file;
1017		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
1018		struct file __rcu **fdentry;
1019		unsigned long nospec_mask;
1020
1021		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
1022		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
1023
1024		/*
1025		 * fdentry points to the 'fd' offset, or fdt->fd[0].
1026		 * Loading from fdt->fd[0] is always safe, because the
1027		 * array always exists.
1028		 */
1029		fdentry = fdt->fd + (fd & nospec_mask);
1030
1031		/* Do the load, then mask any invalid result */
1032		file = rcu_dereference_raw(*fdentry);
1033		file = (void *)(nospec_mask & (unsigned long)file);
1034		if (unlikely(!file))
1035			return NULL;
1036
1037		/*
1038		 * Ok, we have a file pointer that was valid at
1039		 * some point, but it might have become stale since.
1040		 *
1041		 * We need to confirm it by incrementing the refcount
1042		 * and then check the lookup again.
1043		 *
1044		 * file_ref_get() gives us a full memory barrier. We
1045		 * only really need an 'acquire' one to protect the
1046		 * loads below, but we don't have that.
1047		 */
1048		if (unlikely(!file_ref_get(&file->f_ref)))
1049			continue;
1050
1051		/*
1052		 * Such a race can take two forms:
1053		 *
1054		 *  (a) the file ref already went down to zero and the
1055		 *      file hasn't been reused yet or the file count
1056		 *      isn't zero but the file has already been reused.
1057		 *
1058		 *  (b) the file table entry has changed under us.
1059		 *       Note that we don't need to re-check the 'fdt->fd'
1060		 *       pointer having changed, because it always goes
1061		 *       hand-in-hand with 'fdt'.
1062		 *
1063		 * If so, we need to put our ref and try again.
1064		 */
1065		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
1066		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
1067			fput(file);
1068			continue;
1069		}
1070
1071		/*
1072		 * This isn't the file we're looking for or we're not
1073		 * allowed to get a reference to it.
1074		 */
1075		if (unlikely(file->f_mode & mask)) {
1076			fput(file);
1077			return NULL;
1078		}
1079
1080		/*
1081		 * Ok, we have a ref to the file, and checked that it
1082		 * still exists.
1083		 */
1084		return file;
1085	}
1086}
1087
1088static struct file *__fget_files(struct files_struct *files, unsigned int fd,
1089				 fmode_t mask)
1090{
1091	struct file *file;
1092
1093	rcu_read_lock();
1094	file = __fget_files_rcu(files, fd, mask);
1095	rcu_read_unlock();
1096
1097	return file;
1098}
1099
1100static inline struct file *__fget(unsigned int fd, fmode_t mask)
1101{
1102	return __fget_files(current->files, fd, mask);
1103}
1104
1105struct file *fget(unsigned int fd)
1106{
1107	return __fget(fd, FMODE_PATH);
1108}
1109EXPORT_SYMBOL(fget);
1110
/*
 * Like fget(), but with an empty mask: no file is rejected because of
 * its f_mode bits. Returns NULL only when no file is found for @fd.
 */
struct file *fget_raw(unsigned int fd)
{
	struct file *file = __fget(fd, 0);

	return file;
}
EXPORT_SYMBOL(fget_raw);
1116
1117struct file *fget_task(struct task_struct *task, unsigned int fd)
1118{
1119	struct file *file = NULL;
1120
1121	task_lock(task);
1122	if (task->files)
1123		file = __fget_files(task->files, fd, 0);
1124	task_unlock(task);
1125
1126	return file;
1127}
1128
1129struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
1130{
1131	/* Must be called with rcu_read_lock held */
1132	struct files_struct *files;
1133	unsigned int fd = *ret_fd;
1134	struct file *file = NULL;
1135
1136	task_lock(task);
1137	files = task->files;
1138	if (files) {
1139		rcu_read_lock();
1140		for (; fd < files_fdtable(files)->max_fds; fd++) {
1141			file = __fget_files_rcu(files, fd, 0);
1142			if (file)
1143				break;
1144		}
1145		rcu_read_unlock();
1146	}
1147	task_unlock(task);
1148	*ret_fd = fd;
1149	return file;
1150}
1151EXPORT_SYMBOL(fget_task_next);
1152
1153/*
1154 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
1155 *
1156 * You can use this instead of fget if you satisfy all of the following
1157 * conditions:
1158 * 1) You must call fput_light before exiting the syscall and returning control
1159 *    to userspace (i.e. you cannot remember the returned struct file * after
1160 *    returning to userspace).
1161 * 2) You must not call filp_close on the returned struct file * in between
1162 *    calls to fget_light and fput_light.
1163 * 3) You must not clone the current task in between the calls to fget_light
1164 *    and fput_light.
1165 *
1166 * The fput_needed flag returned by fget_light should be passed to the
1167 * corresponding fput_light.
1168 *
1169 * (As an exception to rule 2, you can call filp_close between fget_light and
1170 * fput_light provided that you capture a real refcount with get_file before
1171 * the call to filp_close, and ensure that this real refcount is fput *after*
1172 * the fput_light call.)
1173 *
1174 * See also the documentation in rust/kernel/file.rs.
1175 */
1176static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
1177{
1178	struct files_struct *files = current->files;
1179	struct file *file;
1180
1181	/*
1182	 * If another thread is concurrently calling close_fd() followed
1183	 * by put_files_struct(), we must not observe the old table
1184	 * entry combined with the new refcount - otherwise we could
1185	 * return a file that is concurrently being freed.
1186	 *
1187	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
1188	 * put_files_struct().
1189	 */
1190	if (likely(atomic_read_acquire(&files->count) == 1)) {
1191		file = files_lookup_fd_raw(files, fd);
1192		if (!file || unlikely(file->f_mode & mask))
1193			return EMPTY_FD;
1194		return BORROWED_FD(file);
1195	} else {
1196		file = __fget_files(files, fd, mask);
1197		if (!file)
1198			return EMPTY_FD;
1199		return CLONED_FD(file);
1200	}
1201}
1202struct fd fdget(unsigned int fd)
1203{
1204	return __fget_light(fd, FMODE_PATH);
1205}
1206EXPORT_SYMBOL(fdget);
1207
1208struct fd fdget_raw(unsigned int fd)
1209{
1210	return __fget_light(fd, 0);
1211}
1212
1213/*
1214 * Try to avoid f_pos locking. We only need it if the
1215 * file is marked for FMODE_ATOMIC_POS, and it can be
1216 * accessed multiple ways.
1217 *
1218 * Always do it for directories, because pidfd_getfd()
1219 * can make a file accessible even if it otherwise would
1220 * not be, and for directories this is a correctness
1221 * issue, not a "POSIX requirement".
1222 */
1223static inline bool file_needs_f_pos_lock(struct file *file)
1224{
1225	if (!(file->f_mode & FMODE_ATOMIC_POS))
1226		return false;
1227	if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
1228		return true;
1229	if (file->f_op->iterate_shared)
1230		return true;
1231	return false;
1232}
1233
1234bool file_seek_cur_needs_f_lock(struct file *file)
1235{
1236	if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
1237		return false;
1238
1239	/*
1240	 * Note that we are not guaranteed to be called after fdget_pos() on
1241	 * this file obj, in which case the caller is expected to provide the
1242	 * appropriate locking.
1243	 */
1244
1245	return true;
1246}
1247
1248struct fd fdget_pos(unsigned int fd)
1249{
1250	struct fd f = fdget(fd);
1251	struct file *file = fd_file(f);
1252
1253	if (likely(file) && file_needs_f_pos_lock(file)) {
1254		f.word |= FDPUT_POS_UNLOCK;
1255		mutex_lock(&file->f_pos_lock);
1256	}
1257	return f;
1258}
1259
1260void __f_unlock_pos(struct file *f)
1261{
1262	mutex_unlock(&f->f_pos_lock);
1263}
1264
1265/*
1266 * We only lock f_pos if we have threads or if the file might be
1267 * shared with another process. In both cases we'll have an elevated
1268 * file count (done either by fdget() or by fork()).
1269 */
1270
1271void set_close_on_exec(unsigned int fd, int flag)
1272{
1273	struct files_struct *files = current->files;
1274	spin_lock(&files->file_lock);
1275	__set_close_on_exec(fd, files_fdtable(files), flag);
1276	spin_unlock(&files->file_lock);
1277}
1278
1279bool get_close_on_exec(unsigned int fd)
1280{
1281	bool res;
1282	rcu_read_lock();
1283	res = close_on_exec(fd, current->files);
1284	rcu_read_unlock();
1285	return res;
1286}
1287
1288static int do_dup2(struct files_struct *files,
1289	struct file *file, unsigned fd, unsigned flags)
1290__releases(&files->file_lock)
1291{
1292	struct file *tofree;
1293	struct fdtable *fdt;
1294
1295	/*
1296	 * dup2() is expected to close the file installed in the target fd slot
1297	 * (if any). However, userspace hand-picking a fd may be racing against
1298	 * its own threads which happened to allocate it in open() et al but did
1299	 * not populate it yet.
1300	 *
1301	 * Broadly speaking we may be racing against the following:
1302	 * fd = get_unused_fd_flags();     // fd slot reserved, ->fd[fd] == NULL
1303	 * file = hard_work_goes_here();
1304	 * fd_install(fd, file);           // only now ->fd[fd] == file
1305	 *
1306	 * It is an invariant that a successfully allocated fd has a NULL entry
1307	 * in the array until the matching fd_install().
1308	 *
1309	 * If we fit the window, we have the fd to populate, yet no target file
1310	 * to close. Trying to ignore it and install our new file would violate
1311	 * the invariant and make fd_install() overwrite our file.
1312	 *
1313	 * Things can be done(tm) to handle this. However, the issue does not
1314	 * concern legitimate programs and we only need to make sure the kernel
1315	 * does not trip over it.
1316	 *
1317	 * The simplest way out is to return an error if we find ourselves here.
1318	 *
1319	 * POSIX is silent on the issue, we return -EBUSY.
1320	 */
1321	fdt = files_fdtable(files);
1322	fd = array_index_nospec(fd, fdt->max_fds);
1323	tofree = rcu_dereference_raw(fdt->fd[fd]);
1324	if (!tofree && fd_is_open(fd, fdt))
1325		goto Ebusy;
1326	get_file(file);
1327	rcu_assign_pointer(fdt->fd[fd], file);
1328	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
1329	spin_unlock(&files->file_lock);
1330
1331	if (tofree)
1332		filp_close(tofree, files);
1333
1334	return fd;
1335
1336Ebusy:
1337	spin_unlock(&files->file_lock);
1338	return -EBUSY;
1339}
1340
1341int replace_fd(unsigned fd, struct file *file, unsigned flags)
1342{
1343	int err;
1344	struct files_struct *files = current->files;
1345
1346	if (!file)
1347		return close_fd(fd);
1348
1349	if (fd >= rlimit(RLIMIT_NOFILE))
1350		return -EBADF;
1351
1352	spin_lock(&files->file_lock);
1353	err = expand_files(files, fd);
1354	if (unlikely(err < 0))
1355		goto out_unlock;
1356	err = do_dup2(files, file, fd, flags);
1357	if (err < 0)
1358		return err;
1359	return 0;
1360
1361out_unlock:
1362	spin_unlock(&files->file_lock);
1363	return err;
1364}
1365
1366/**
1367 * receive_fd() - Install received file into file descriptor table
1368 * @file: struct file that was received from another process
1369 * @ufd: __user pointer to write new fd number to
1370 * @o_flags: the O_* flags to apply to the new fd entry
1371 *
1372 * Installs a received file into the file descriptor table, with appropriate
1373 * checks and count updates. Optionally writes the fd number to userspace, if
1374 * @ufd is non-NULL.
1375 *
1376 * This helper handles its own reference counting of the incoming
1377 * struct file.
1378 *
1379 * Returns newly install fd or -ve on error.
1380 */
1381int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
1382{
1383	int error;
1384
1385	error = security_file_receive(file);
1386	if (error)
1387		return error;
1388
1389	FD_PREPARE(fdf, o_flags, file);
1390	if (fdf.err)
1391		return fdf.err;
1392	get_file(file);
1393
1394	if (ufd) {
1395		error = put_user(fd_prepare_fd(fdf), ufd);
1396		if (error)
1397			return error;
1398	}
1399
1400	__receive_sock(fd_prepare_file(fdf));
1401	return fd_publish(fdf);
1402}
1403EXPORT_SYMBOL_GPL(receive_fd);
1404
/*
 * Install a received @file at the caller-chosen descriptor @new_fd.
 *
 * Runs security_file_receive() first, then replace_fd(); the first
 * non-zero result of either is returned as the error. On success
 * __receive_sock() is called on @file and @new_fd is returned.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int ret = security_file_receive(file);

	if (!ret)
		ret = replace_fd(new_fd, file, o_flags);
	if (ret)
		return ret;

	__receive_sock(file);
	return new_fd;
}
1418
1419static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1420{
1421	int err = -EBADF;
1422	struct file *file;
1423	struct files_struct *files = current->files;
1424
1425	if ((flags & ~O_CLOEXEC) != 0)
1426		return -EINVAL;
1427
1428	if (unlikely(oldfd == newfd))
1429		return -EINVAL;
1430
1431	if (newfd >= rlimit(RLIMIT_NOFILE))
1432		return -EBADF;
1433
1434	spin_lock(&files->file_lock);
1435	err = expand_files(files, newfd);
1436	file = files_lookup_fd_locked(files, oldfd);
1437	if (unlikely(!file))
1438		goto Ebadf;
1439	if (unlikely(err < 0)) {
1440		if (err == -EMFILE)
1441			goto Ebadf;
1442		goto out_unlock;
1443	}
1444	return do_dup2(files, file, newfd, flags);
1445
1446Ebadf:
1447	err = -EBADF;
1448out_unlock:
1449	spin_unlock(&files->file_lock);
1450	return err;
1451}
1452
1453SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
1454{
1455	return ksys_dup3(oldfd, newfd, flags);
1456}
1457
1458SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
1459{
1460	if (unlikely(newfd == oldfd)) { /* corner case */
1461		struct files_struct *files = current->files;
1462		struct file *f;
1463		int retval = oldfd;
1464
1465		rcu_read_lock();
1466		f = __fget_files_rcu(files, oldfd, 0);
1467		if (!f)
1468			retval = -EBADF;
1469		rcu_read_unlock();
1470		if (f)
1471			fput(f);
1472		return retval;
1473	}
1474	return ksys_dup3(oldfd, newfd, 0);
1475}
1476
1477SYSCALL_DEFINE1(dup, unsigned int, fildes)
1478{
1479	int ret = -EBADF;
1480	struct file *file = fget_raw(fildes);
1481
1482	if (file) {
1483		ret = get_unused_fd_flags(0);
1484		if (ret >= 0)
1485			fd_install(ret, file);
1486		else
1487			fput(file);
1488	}
1489	return ret;
1490}
1491
1492int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1493{
1494	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1495	int err;
1496	if (from >= nofile)
1497		return -EINVAL;
1498	err = alloc_fd(from, nofile, flags);
1499	if (err >= 0) {
1500		get_file(file);
1501		fd_install(err, file);
1502	}
1503	return err;
1504}
1505
1506int iterate_fd(struct files_struct *files, unsigned n,
1507		int (*f)(const void *, struct file *, unsigned),
1508		const void *p)
1509{
1510	struct fdtable *fdt;
1511	int res = 0;
1512	if (!files)
1513		return 0;
1514	spin_lock(&files->file_lock);
1515	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1516		struct file *file;
1517		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1518		if (!file)
1519			continue;
1520		res = f(p, file, n);
1521		if (res)
1522			break;
1523	}
1524	spin_unlock(&files->file_lock);
1525	return res;
1526}
1527EXPORT_SYMBOL(iterate_fd);