Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'exec-for-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull execve updates from Eric Biederman:
"This set of changes ultimately fixes the interaction of POSIX file
locks and exec. Fundamentally, most of the change is just moving where
unshare_files is called during exec, and tweaking the users of
files_struct so that the count of files_struct is not unnecessarily
played with.

Along the way fcheck and related helpers were renamed to more
accurately reflect what they do.

There were also many other small changes that fell out, as this is the
first time in a long time much of this code has been touched.

Benchmarks haven't turned up any practical issues but Al Viro has
observed a possibility for a lot of pounding on task_lock. So I have
some changes in progress to convert put_files_struct to always rcu
free files_struct. That wasn't ready for the merge window so that will
have to wait until next time"

* 'exec-for-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (27 commits)
exec: Move io_uring_task_cancel after the point of no return
coredump: Document coredump code exclusively used by cell spufs
file: Remove get_files_struct
file: Rename __close_fd_get_file close_fd_get_file
file: Replace ksys_close with close_fd
file: Rename __close_fd to close_fd and remove the files parameter
file: Merge __alloc_fd into alloc_fd
file: In f_dupfd read RLIMIT_NOFILE once.
file: Merge __fd_install into fd_install
proc/fd: In fdinfo seq_show don't use get_files_struct
bpf/task_iter: In task_file_seq_get_next use task_lookup_next_fd_rcu
proc/fd: In proc_readfd_common use task_lookup_next_fd_rcu
file: Implement task_lookup_next_fd_rcu
kcmp: In get_file_raw_ptr use task_lookup_fd_rcu
proc/fd: In tid_fd_mode use task_lookup_fd_rcu
file: Implement task_lookup_fd_rcu
file: Rename fcheck lookup_fd_rcu
file: Replace fcheck_files with files_lookup_fd_rcu
file: Factor files_lookup_fd_locked out of fcheck_files
file: Rename __fcheck_files to files_lookup_fd_raw
...

+158 -244
+4 -4
Documentation/filesystems/files.rst
··· 62 62 be held. 63 63 64 64 4. To look up the file structure given an fd, a reader 65 - must use either fcheck() or fcheck_files() APIs. These 65 + must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These 66 66 take care of barrier requirements due to lock-free lookup. 67 67 68 68 An example:: ··· 70 70 struct file *file; 71 71 72 72 rcu_read_lock(); 73 - file = fcheck(fd); 73 + file = lookup_fd_rcu(fd); 74 74 if (file) { 75 75 ... 76 76 } ··· 84 84 on ->f_count:: 85 85 86 86 rcu_read_lock(); 87 - file = fcheck_files(files, fd); 87 + file = files_lookup_fd_rcu(files, fd); 88 88 if (file) { 89 89 if (atomic_long_inc_not_zero(&file->f_count)) 90 90 *fput_needed = 1; ··· 104 104 lock-free, they must be installed using rcu_assign_pointer() 105 105 API. If they are looked up lock-free, rcu_dereference() 106 106 must be used. However it is advisable to use files_fdtable() 107 - and fcheck()/fcheck_files() which take care of these issues. 107 + and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues. 108 108 109 109 7. While updating, the fdtable pointer must be looked up while 110 110 holding files->file_lock. If ->file_lock is dropped, then
+1 -1
arch/powerpc/platforms/cell/spufs/coredump.c
··· 74 74 *fd = n - 1; 75 75 76 76 rcu_read_lock(); 77 - file = fcheck(*fd); 77 + file = lookup_fd_rcu(*fd); 78 78 ctx = SPUFS_I(file_inode(file))->i_ctx; 79 79 get_spu_context(ctx); 80 80 rcu_read_unlock();
+1 -1
drivers/android/binder.c
··· 1836 1836 if (!twcb) 1837 1837 return; 1838 1838 init_task_work(&twcb->twork, binder_do_fd_close); 1839 - __close_fd_get_file(fd, &twcb->file); 1839 + close_fd_get_file(fd, &twcb->file); 1840 1840 if (twcb->file) { 1841 1841 filp_close(twcb->file, current->files); 1842 1842 task_work_add(current, &twcb->twork, TWA_RESUME);
+3 -2
fs/autofs/dev-ioctl.c
··· 4 4 * Copyright 2008 Ian Kent <raven@themaw.net> 5 5 */ 6 6 7 + #include <linux/module.h> 7 8 #include <linux/miscdevice.h> 8 9 #include <linux/compat.h> 9 - #include <linux/syscalls.h> 10 + #include <linux/fdtable.h> 10 11 #include <linux/magic.h> 11 12 #include <linux/nospec.h> 12 13 ··· 290 289 struct autofs_sb_info *sbi, 291 290 struct autofs_dev_ioctl *param) 292 291 { 293 - return ksys_close(param->ioctlfd); 292 + return close_fd(param->ioctlfd); 294 293 } 295 294 296 295 /*
+2
fs/binfmt_elf.c
··· 2198 2198 { 2199 2199 size_t sz = get_note_info_size(&info); 2200 2200 2201 + /* For cell spufs */ 2201 2202 sz += elf_coredump_extra_notes_size(); 2202 2203 2203 2204 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); ··· 2262 2261 if (!write_note_info(&info, cprm)) 2263 2262 goto end_coredump; 2264 2263 2264 + /* For cell spufs */ 2265 2265 if (elf_coredump_extra_notes_write(cprm)) 2266 2266 goto end_coredump; 2267 2267
+2 -4
fs/coredump.c
··· 586 586 int ispipe; 587 587 size_t *argv = NULL; 588 588 int argc = 0; 589 - struct files_struct *displaced; 590 589 /* require nonrelative corefile path and be extra careful */ 591 590 bool need_suid_safe = false; 592 591 bool core_dumped = false; ··· 791 792 } 792 793 793 794 /* get us an unshared descriptor table; almost always a no-op */ 794 - retval = unshare_files(&displaced); 795 + /* The cell spufs coredump code reads the file descriptor tables */ 796 + retval = unshare_files(); 795 797 if (retval) 796 798 goto close_fail; 797 - if (displaced) 798 - put_files_struct(displaced); 799 799 if (!dump_interrupted()) { 800 800 /* 801 801 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+18 -21
fs/exec.c
··· 1259 1259 goto out; 1260 1260 1261 1261 /* 1262 + * Cancel any io_uring activity across execve 1263 + */ 1264 + io_uring_task_cancel(); 1265 + 1266 + /* Ensure the files table is not shared. */ 1267 + retval = unshare_files(); 1268 + if (retval) 1269 + goto out; 1270 + 1271 + /* 1262 1272 * Must be called _before_ exec_mmap() as bprm->mm is 1263 1273 * not visibile until then. This also enables the update 1264 1274 * to be lockless. ··· 1789 1779 int fd, struct filename *filename, int flags) 1790 1780 { 1791 1781 struct file *file; 1792 - struct files_struct *displaced; 1793 1782 int retval; 1794 - 1795 - /* 1796 - * Cancel any io_uring activity across execve 1797 - */ 1798 - io_uring_task_cancel(); 1799 - 1800 - retval = unshare_files(&displaced); 1801 - if (retval) 1802 - return retval; 1803 1783 1804 1784 retval = prepare_bprm_creds(bprm); 1805 1785 if (retval) 1806 - goto out_files; 1786 + return retval; 1807 1787 1808 1788 check_unsafe_exec(bprm); 1809 1789 current->in_execve = 1; ··· 1808 1808 bprm->file = file; 1809 1809 /* 1810 1810 * Record that a name derived from an O_CLOEXEC fd will be 1811 - * inaccessible after exec. Relies on having exclusive access to 1812 - * current->files (due to unshare_files above). 1811 + * inaccessible after exec. This allows the code in exec to 1812 + * choose to fail when the executable is not mmaped into the 1813 + * interpreter and an open file descriptor is not passed to 1814 + * the interpreter. This makes for a better user experience 1815 + * than having the interpreter start and then immediately fail 1816 + * when it finds the executable is inaccessible. 
1813 1817 */ 1814 - if (bprm->fdpath && 1815 - close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) 1818 + if (bprm->fdpath && get_close_on_exec(fd)) 1816 1819 bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; 1817 1820 1818 1821 /* Set the unchanging part of bprm->cred */ ··· 1833 1830 rseq_execve(current); 1834 1831 acct_update_integrals(current); 1835 1832 task_numa_free(current, false); 1836 - if (displaced) 1837 - put_files_struct(displaced); 1838 1833 return retval; 1839 1834 1840 1835 out: ··· 1848 1847 out_unmark: 1849 1848 current->fs->in_exec = 0; 1850 1849 current->in_execve = 0; 1851 - 1852 - out_files: 1853 - if (displaced) 1854 - reset_files_struct(displaced); 1855 1850 1856 1851 return retval; 1857 1852 }
+58 -66
fs/file.c
··· 158 158 spin_unlock(&files->file_lock); 159 159 new_fdt = alloc_fdtable(nr); 160 160 161 - /* make sure all __fd_install() have seen resize_in_progress 161 + /* make sure all fd_install() have seen resize_in_progress 162 162 * or have finished their rcu_read_lock_sched() section. 163 163 */ 164 164 if (atomic_read(&files->count) > 1) ··· 181 181 rcu_assign_pointer(files->fdt, new_fdt); 182 182 if (cur_fdt != &files->fdtab) 183 183 call_rcu(&cur_fdt->rcu, free_fdtable_rcu); 184 - /* coupled with smp_rmb() in __fd_install() */ 184 + /* coupled with smp_rmb() in fd_install() */ 185 185 smp_wmb(); 186 186 return 1; 187 187 } ··· 411 411 return fdt; 412 412 } 413 413 414 - struct files_struct *get_files_struct(struct task_struct *task) 415 - { 416 - struct files_struct *files; 417 - 418 - task_lock(task); 419 - files = task->files; 420 - if (files) 421 - atomic_inc(&files->count); 422 - task_unlock(task); 423 - 424 - return files; 425 - } 426 - 427 414 void put_files_struct(struct files_struct *files) 428 415 { 429 416 if (atomic_dec_and_test(&files->count)) { ··· 421 434 __free_fdtable(fdt); 422 435 kmem_cache_free(files_cachep, files); 423 436 } 424 - } 425 - 426 - void reset_files_struct(struct files_struct *files) 427 - { 428 - struct task_struct *tsk = current; 429 - struct files_struct *old; 430 - 431 - old = tsk->files; 432 - task_lock(tsk); 433 - tsk->files = files; 434 - task_unlock(tsk); 435 - put_files_struct(old); 436 437 } 437 438 438 439 void exit_files(struct task_struct *tsk) ··· 467 492 /* 468 493 * allocate a file descriptor, mark it busy. 
469 494 */ 470 - int __alloc_fd(struct files_struct *files, 471 - unsigned start, unsigned end, unsigned flags) 495 + static int alloc_fd(unsigned start, unsigned end, unsigned flags) 472 496 { 497 + struct files_struct *files = current->files; 473 498 unsigned int fd; 474 499 int error; 475 500 struct fdtable *fdt; ··· 525 550 return error; 526 551 } 527 552 528 - static int alloc_fd(unsigned start, unsigned flags) 529 - { 530 - return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); 531 - } 532 - 533 553 int __get_unused_fd_flags(unsigned flags, unsigned long nofile) 534 554 { 535 - return __alloc_fd(current->files, 0, nofile, flags); 555 + return alloc_fd(0, nofile, flags); 536 556 } 537 557 538 558 int get_unused_fd_flags(unsigned flags) ··· 566 596 * It should never happen - if we allow dup2() do it, _really_ bad things 567 597 * will follow. 568 598 * 569 - * NOTE: __fd_install() variant is really, really low-level; don't 570 - * use it unless you are forced to by truly lousy API shoved down 571 - * your throat. 'files' *MUST* be either current->files or obtained 572 - * by get_files_struct(current) done by whoever had given it to you, 573 - * or really bad things will happen. Normally you want to use 574 - * fd_install() instead. 599 + * This consumes the "file" refcount, so callers should treat it 600 + * as if they had called fput(file). 575 601 */ 576 602 577 - void __fd_install(struct files_struct *files, unsigned int fd, 578 - struct file *file) 603 + void fd_install(unsigned int fd, struct file *file) 579 604 { 605 + struct files_struct *files = current->files; 580 606 struct fdtable *fdt; 581 607 582 608 rcu_read_lock_sched(); ··· 592 626 BUG_ON(fdt->fd[fd] != NULL); 593 627 rcu_assign_pointer(fdt->fd[fd], file); 594 628 rcu_read_unlock_sched(); 595 - } 596 - 597 - /* 598 - * This consumes the "file" refcount, so callers should treat it 599 - * as if they had called fput(file). 
600 - */ 601 - void fd_install(unsigned int fd, struct file *file) 602 - { 603 - __fd_install(current->files, fd, file); 604 629 } 605 630 606 631 EXPORT_SYMBOL(fd_install); ··· 616 659 return file; 617 660 } 618 661 619 - /* 620 - * The same warnings as for __alloc_fd()/__fd_install() apply here... 621 - */ 622 - int __close_fd(struct files_struct *files, unsigned fd) 662 + int close_fd(unsigned fd) 623 663 { 664 + struct files_struct *files = current->files; 624 665 struct file *file; 625 666 626 667 file = pick_file(files, fd); ··· 627 672 628 673 return filp_close(file, files); 629 674 } 630 - EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ 675 + EXPORT_SYMBOL(close_fd); /* for ksys_close() */ 631 676 632 677 static inline void __range_cloexec(struct files_struct *cur_fds, 633 678 unsigned int fd, unsigned int max_fd) ··· 732 777 } 733 778 734 779 /* 735 - * variant of __close_fd that gets a ref on the file for later fput. 780 + * variant of close_fd that gets a ref on the file for later fput. 736 781 * The caller must ensure that filp_close() called on the file, and then 737 782 * an fput(). 738 783 */ 739 - int __close_fd_get_file(unsigned int fd, struct file **res) 784 + int close_fd_get_file(unsigned int fd, struct file **res) 740 785 { 741 786 struct files_struct *files = current->files; 742 787 struct file *file; ··· 805 850 806 851 rcu_read_lock(); 807 852 loop: 808 - file = fcheck_files(files, fd); 853 + file = files_lookup_fd_rcu(files, fd); 809 854 if (file) { 810 855 /* File object ref couldn't be taken. 
811 856 * dup2() atomicity guarantee is the reason ··· 856 901 return file; 857 902 } 858 903 904 + struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) 905 + { 906 + /* Must be called with rcu_read_lock held */ 907 + struct files_struct *files; 908 + struct file *file = NULL; 909 + 910 + task_lock(task); 911 + files = task->files; 912 + if (files) 913 + file = files_lookup_fd_rcu(files, fd); 914 + task_unlock(task); 915 + 916 + return file; 917 + } 918 + 919 + struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) 920 + { 921 + /* Must be called with rcu_read_lock held */ 922 + struct files_struct *files; 923 + unsigned int fd = *ret_fd; 924 + struct file *file = NULL; 925 + 926 + task_lock(task); 927 + files = task->files; 928 + if (files) { 929 + for (; fd < files_fdtable(files)->max_fds; fd++) { 930 + file = files_lookup_fd_rcu(files, fd); 931 + if (file) 932 + break; 933 + } 934 + } 935 + task_unlock(task); 936 + *ret_fd = fd; 937 + return file; 938 + } 939 + 859 940 /* 860 941 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
861 942 * ··· 914 923 struct file *file; 915 924 916 925 if (atomic_read(&files->count) == 1) { 917 - file = __fcheck_files(files, fd); 926 + file = files_lookup_fd_raw(files, fd); 918 927 if (!file || unlikely(file->f_mode & mask)) 919 928 return 0; 920 929 return (unsigned long)file; ··· 1036 1045 struct files_struct *files = current->files; 1037 1046 1038 1047 if (!file) 1039 - return __close_fd(files, fd); 1048 + return close_fd(fd); 1040 1049 1041 1050 if (fd >= rlimit(RLIMIT_NOFILE)) 1042 1051 return -EBADF; ··· 1125 1134 1126 1135 spin_lock(&files->file_lock); 1127 1136 err = expand_files(files, newfd); 1128 - file = fcheck(oldfd); 1137 + file = files_lookup_fd_locked(files, oldfd); 1129 1138 if (unlikely(!file)) 1130 1139 goto Ebadf; 1131 1140 if (unlikely(err < 0)) { ··· 1154 1163 int retval = oldfd; 1155 1164 1156 1165 rcu_read_lock(); 1157 - if (!fcheck_files(files, oldfd)) 1166 + if (!files_lookup_fd_rcu(files, oldfd)) 1158 1167 retval = -EBADF; 1159 1168 rcu_read_unlock(); 1160 1169 return retval; ··· 1179 1188 1180 1189 int f_dupfd(unsigned int from, struct file *file, unsigned flags) 1181 1190 { 1191 + unsigned long nofile = rlimit(RLIMIT_NOFILE); 1182 1192 int err; 1183 - if (from >= rlimit(RLIMIT_NOFILE)) 1193 + if (from >= nofile) 1184 1194 return -EINVAL; 1185 - err = alloc_fd(from, flags); 1195 + err = alloc_fd(from, nofile, flags); 1186 1196 if (err >= 0) { 1187 1197 get_file(file); 1188 1198 fd_install(err, file);
+1 -1
fs/io_uring.c
··· 4236 4236 4237 4237 /* might be already done during nonblock submission */ 4238 4238 if (!close->put_file) { 4239 - ret = __close_fd_get_file(close->fd, &close->put_file); 4239 + ret = close_fd_get_file(close->fd, &close->put_file); 4240 4240 if (ret < 0) 4241 4241 return (ret == -ENOENT) ? -EBADF : ret; 4242 4242 }
+8 -6
fs/locks.c
··· 2539 2539 */ 2540 2540 if (!error && file_lock->fl_type != F_UNLCK && 2541 2541 !(file_lock->fl_flags & FL_OFDLCK)) { 2542 + struct files_struct *files = current->files; 2542 2543 /* 2543 2544 * We need that spin_lock here - it prevents reordering between 2544 2545 * update of i_flctx->flc_posix and check for it done in 2545 2546 * close(). rcu_read_lock() wouldn't do. 2546 2547 */ 2547 - spin_lock(&current->files->file_lock); 2548 - f = fcheck(fd); 2549 - spin_unlock(&current->files->file_lock); 2548 + spin_lock(&files->file_lock); 2549 + f = files_lookup_fd_locked(files, fd); 2550 + spin_unlock(&files->file_lock); 2550 2551 if (f != filp) { 2551 2552 file_lock->fl_type = F_UNLCK; 2552 2553 error = do_lock_file_wait(filp, cmd, file_lock); ··· 2671 2670 */ 2672 2671 if (!error && file_lock->fl_type != F_UNLCK && 2673 2672 !(file_lock->fl_flags & FL_OFDLCK)) { 2673 + struct files_struct *files = current->files; 2674 2674 /* 2675 2675 * We need that spin_lock here - it prevents reordering between 2676 2676 * update of i_flctx->flc_posix and check for it done in 2677 2677 * close(). rcu_read_lock() wouldn't do. 2678 2678 */ 2679 - spin_lock(&current->files->file_lock); 2680 - f = fcheck(fd); 2681 - spin_unlock(&current->files->file_lock); 2679 + spin_lock(&files->file_lock); 2680 + f = files_lookup_fd_locked(files, fd); 2681 + spin_unlock(&files->file_lock); 2682 2682 if (f != filp) { 2683 2683 file_lock->fl_type = F_UNLCK; 2684 2684 error = do_lock_file_wait(filp, cmd, file_lock);
+1 -1
fs/notify/dnotify/dnotify.c
··· 327 327 } 328 328 329 329 rcu_read_lock(); 330 - f = fcheck(fd); 330 + f = lookup_fd_rcu(fd); 331 331 rcu_read_unlock(); 332 332 333 333 /* if (f != filp) means that we lost a race and another task/thread
+1 -1
fs/open.c
··· 1296 1296 */ 1297 1297 SYSCALL_DEFINE1(close, unsigned int, fd) 1298 1298 { 1299 - int retval = __close_fd(current->files, fd); 1299 + int retval = close_fd(fd); 1300 1300 1301 1301 /* can't restart close syscall because file table entry was cleared */ 1302 1302 if (unlikely(retval == -ERESTARTSYS ||
+15 -33
fs/proc/fd.c
··· 28 28 if (!task) 29 29 return -ENOENT; 30 30 31 - files = get_files_struct(task); 32 - put_task_struct(task); 33 - 31 + task_lock(task); 32 + files = task->files; 34 33 if (files) { 35 34 unsigned int fd = proc_fd(m->private); 36 35 37 36 spin_lock(&files->file_lock); 38 - file = fcheck_files(files, fd); 37 + file = files_lookup_fd_locked(files, fd); 39 38 if (file) { 40 39 struct fdtable *fdt = files_fdtable(files); 41 40 ··· 46 47 ret = 0; 47 48 } 48 49 spin_unlock(&files->file_lock); 49 - put_files_struct(files); 50 50 } 51 + task_unlock(task); 52 + put_task_struct(task); 51 53 52 54 if (ret) 53 55 return ret; ··· 57 57 (long long)file->f_pos, f_flags, 58 58 real_mount(file->f_path.mnt)->mnt_id); 59 59 60 + /* show_fd_locks() never deferences files so a stale value is safe */ 60 61 show_fd_locks(m, file, files); 61 62 if (seq_has_overflowed(m)) 62 63 goto out; ··· 84 83 85 84 static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) 86 85 { 87 - struct files_struct *files = get_files_struct(task); 88 86 struct file *file; 89 87 90 - if (!files) 91 - return false; 92 - 93 88 rcu_read_lock(); 94 - file = fcheck_files(files, fd); 89 + file = task_lookup_fd_rcu(task, fd); 95 90 if (file) 96 91 *mode = file->f_mode; 97 92 rcu_read_unlock(); 98 - put_files_struct(files); 99 93 return !!file; 100 94 } 101 95 ··· 142 146 143 147 static int proc_fd_link(struct dentry *dentry, struct path *path) 144 148 { 145 - struct files_struct *files = NULL; 146 149 struct task_struct *task; 147 150 int ret = -ENOENT; 148 151 149 152 task = get_proc_task(d_inode(dentry)); 150 153 if (task) { 151 - files = get_files_struct(task); 152 - put_task_struct(task); 153 - } 154 - 155 - if (files) { 156 154 unsigned int fd = proc_fd(d_inode(dentry)); 157 155 struct file *fd_file; 158 156 159 - spin_lock(&files->file_lock); 160 - fd_file = fcheck_files(files, fd); 157 + fd_file = fget_task(task, fd); 161 158 if (fd_file) { 162 159 *path = fd_file->f_path; 163 160 
path_get(&fd_file->f_path); 164 161 ret = 0; 162 + fput(fd_file); 165 163 } 166 - spin_unlock(&files->file_lock); 167 - put_files_struct(files); 164 + put_task_struct(task); 168 165 } 169 166 170 167 return ret; ··· 218 229 instantiate_t instantiate) 219 230 { 220 231 struct task_struct *p = get_proc_task(file_inode(file)); 221 - struct files_struct *files; 222 232 unsigned int fd; 223 233 224 234 if (!p) ··· 225 237 226 238 if (!dir_emit_dots(file, ctx)) 227 239 goto out; 228 - files = get_files_struct(p); 229 - if (!files) 230 - goto out; 231 240 232 241 rcu_read_lock(); 233 - for (fd = ctx->pos - 2; 234 - fd < files_fdtable(files)->max_fds; 235 - fd++, ctx->pos++) { 242 + for (fd = ctx->pos - 2;; fd++) { 236 243 struct file *f; 237 244 struct fd_data data; 238 245 char name[10 + 1]; 239 246 unsigned int len; 240 247 241 - f = fcheck_files(files, fd); 248 + f = task_lookup_next_fd_rcu(p, &fd); 249 + ctx->pos = fd + 2LL; 242 250 if (!f) 243 - continue; 251 + break; 244 252 data.mode = f->f_mode; 245 253 rcu_read_unlock(); 246 254 data.fd = fd; ··· 245 261 if (!proc_fill_cache(file, ctx, 246 262 name, len, instantiate, p, 247 263 &data)) 248 - goto out_fd_loop; 264 + goto out; 249 265 cond_resched(); 250 266 rcu_read_lock(); 251 267 } 252 268 rcu_read_unlock(); 253 - out_fd_loop: 254 - put_files_struct(files); 255 269 out: 256 270 put_task_struct(p); 257 271 return 0;
+21 -19
include/linux/fdtable.h
··· 80 80 /* 81 81 * The caller must ensure that fd table isn't shared or hold rcu or file lock 82 82 */ 83 - static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) 83 + static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) 84 84 { 85 85 struct fdtable *fdt = rcu_dereference_raw(files->fdt); 86 86 ··· 91 91 return NULL; 92 92 } 93 93 94 - static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) 94 + static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) 95 95 { 96 - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && 97 - !lockdep_is_held(&files->file_lock), 96 + RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), 98 97 "suspicious rcu_dereference_check() usage"); 99 - return __fcheck_files(files, fd); 98 + return files_lookup_fd_raw(files, fd); 100 99 } 101 100 102 - /* 103 - * Check whether the specified fd has an open file. 104 - */ 105 - #define fcheck(fd) fcheck_files(current->files, fd) 101 + static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd) 102 + { 103 + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), 104 + "suspicious rcu_dereference_check() usage"); 105 + return files_lookup_fd_raw(files, fd); 106 + } 107 + 108 + static inline struct file *lookup_fd_rcu(unsigned int fd) 109 + { 110 + return files_lookup_fd_rcu(current->files, fd); 111 + } 112 + 113 + struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd); 114 + struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd); 106 115 107 116 struct task_struct; 108 117 109 - struct files_struct *get_files_struct(struct task_struct *); 110 118 void put_files_struct(struct files_struct *fs); 111 - void reset_files_struct(struct files_struct *); 112 - int unshare_files(struct files_struct **); 119 + int unshare_files(void); 113 120 struct files_struct *dup_fd(struct files_struct *, unsigned, int *) 
__latent_entropy; 114 121 void do_close_on_exec(struct files_struct *); 115 122 int iterate_fd(struct files_struct *, unsigned, 116 123 int (*)(const void *, struct file *, unsigned), 117 124 const void *); 118 125 119 - extern int __alloc_fd(struct files_struct *files, 120 - unsigned start, unsigned end, unsigned flags); 121 - extern void __fd_install(struct files_struct *files, 122 - unsigned int fd, struct file *file); 123 - extern int __close_fd(struct files_struct *files, 124 - unsigned int fd); 126 + extern int close_fd(unsigned int fd); 125 127 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); 126 - extern int __close_fd_get_file(unsigned int fd, struct file **res); 128 + extern int close_fd_get_file(unsigned int fd, struct file **res); 127 129 extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, 128 130 struct files_struct **new_fdp); 129 131
-12
include/linux/syscalls.h
··· 1295 1295 return do_sys_ftruncate(fd, length, 1); 1296 1296 } 1297 1297 1298 - extern int __close_fd(struct files_struct *files, unsigned int fd); 1299 - 1300 - /* 1301 - * In contrast to sys_close(), this stub does not check whether the syscall 1302 - * should or should not be restarted, but returns the raw error codes from 1303 - * __close_fd(). 1304 - */ 1305 - static inline int ksys_close(unsigned int fd) 1306 - { 1307 - return __close_fd(current->files, fd); 1308 - } 1309 - 1310 1298 extern long do_sys_truncate(const char __user *pathname, loff_t length); 1311 1299 1312 1300 static inline long ksys_truncate(const char __user *pathname, loff_t length)
+3 -17
kernel/bpf/syscall.c
··· 3874 3874 pid_t pid = attr->task_fd_query.pid; 3875 3875 u32 fd = attr->task_fd_query.fd; 3876 3876 const struct perf_event *event; 3877 - struct files_struct *files; 3878 3877 struct task_struct *task; 3879 3878 struct file *file; 3880 3879 int err; ··· 3891 3892 if (!task) 3892 3893 return -ENOENT; 3893 3894 3894 - files = get_files_struct(task); 3895 - put_task_struct(task); 3896 - if (!files) 3897 - return -ENOENT; 3898 - 3899 3895 err = 0; 3900 - spin_lock(&files->file_lock); 3901 - file = fcheck_files(files, fd); 3896 + file = fget_task(task, fd); 3897 + put_task_struct(task); 3902 3898 if (!file) 3903 - err = -EBADF; 3904 - else 3905 - get_file(file); 3906 - spin_unlock(&files->file_lock); 3907 - put_files_struct(files); 3908 - 3909 - if (err) 3910 - goto out; 3899 + return -EBADF; 3911 3900 3912 3901 if (file->f_op == &bpf_link_fops) { 3913 3902 struct bpf_link *link = file->private_data; ··· 3935 3948 err = -ENOTSUPP; 3936 3949 put_file: 3937 3950 fput(file); 3938 - out: 3939 3951 return err; 3940 3952 } 3941 3953
+7 -26
kernel/bpf/task_iter.c
··· 130 130 */ 131 131 struct bpf_iter_seq_task_common common; 132 132 struct task_struct *task; 133 - struct files_struct *files; 134 133 u32 tid; 135 134 u32 fd; 136 135 }; ··· 138 139 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) 139 140 { 140 141 struct pid_namespace *ns = info->common.ns; 141 - u32 curr_tid = info->tid, max_fds; 142 - struct files_struct *curr_files; 142 + u32 curr_tid = info->tid; 143 143 struct task_struct *curr_task; 144 - int curr_fd = info->fd; 144 + unsigned int curr_fd = info->fd; 145 145 146 146 /* If this function returns a non-NULL file object, 147 - * it held a reference to the task/files_struct/file. 147 + * it held a reference to the task/file. 148 148 * Otherwise, it does not hold any reference. 149 149 */ 150 150 again: 151 151 if (info->task) { 152 152 curr_task = info->task; 153 - curr_files = info->files; 154 153 curr_fd = info->fd; 155 154 } else { 156 155 curr_task = task_seq_get_next(ns, &curr_tid, true); 157 156 if (!curr_task) { 158 157 info->task = NULL; 159 - info->files = NULL; 160 158 return NULL; 161 159 } 162 160 163 - curr_files = get_files_struct(curr_task); 164 - if (!curr_files) { 165 - put_task_struct(curr_task); 166 - curr_tid = ++(info->tid); 167 - info->fd = 0; 168 - goto again; 169 - } 170 - 171 - info->files = curr_files; 161 + /* set info->task and info->tid */ 172 162 info->task = curr_task; 173 163 if (curr_tid == info->tid) { 174 164 curr_fd = info->fd; ··· 168 180 } 169 181 170 182 rcu_read_lock(); 171 - max_fds = files_fdtable(curr_files)->max_fds; 172 - for (; curr_fd < max_fds; curr_fd++) { 183 + for (;; curr_fd++) { 173 184 struct file *f; 174 - 175 - f = fcheck_files(curr_files, curr_fd); 185 + f = task_lookup_next_fd_rcu(curr_task, &curr_fd); 176 186 if (!f) 177 - continue; 187 + break; 178 188 if (!get_file_rcu(f)) 179 189 continue; 180 190 ··· 184 198 185 199 /* the current task is done, go to the next task */ 186 200 rcu_read_unlock(); 187 - put_files_struct(curr_files); 
188 201 put_task_struct(curr_task); 189 202 info->task = NULL; 190 - info->files = NULL; 191 203 info->fd = 0; 192 204 curr_tid = ++(info->tid); 193 205 goto again; ··· 197 213 struct file *file; 198 214 199 215 info->task = NULL; 200 - info->files = NULL; 201 216 file = task_file_seq_get_next(info); 202 217 if (file && *pos == 0) 203 218 ++*pos; ··· 258 275 (void)__task_file_seq_show(seq, v, true); 259 276 } else { 260 277 fput((struct file *)v); 261 - put_files_struct(info->files); 262 278 put_task_struct(info->task); 263 - info->files = NULL; 264 279 info->task = NULL; 265 280 } 266 281 }
+6 -6
kernel/fork.c
··· 3031 3031 * the exec layer of the kernel. 3032 3032 */ 3033 3033 3034 - int unshare_files(struct files_struct **displaced) 3034 + int unshare_files(void) 3035 3035 { 3036 3036 struct task_struct *task = current; 3037 - struct files_struct *copy = NULL; 3037 + struct files_struct *old, *copy = NULL; 3038 3038 int error; 3039 3039 3040 3040 error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); 3041 - if (error || !copy) { 3042 - *displaced = NULL; 3041 + if (error || !copy) 3043 3042 return error; 3044 - } 3045 - *displaced = task->files; 3043 + 3044 + old = task->files; 3046 3045 task_lock(task); 3047 3046 task->files = copy; 3048 3047 task_unlock(task); 3048 + put_files_struct(old); 3049 3049 return 0; 3050 3050 } 3051 3051
+6 -23
kernel/kcmp.c
··· 61 61 static struct file * 62 62 get_file_raw_ptr(struct task_struct *task, unsigned int idx) 63 63 { 64 - struct file *file = NULL; 64 + struct file *file; 65 65 66 - task_lock(task); 67 66 rcu_read_lock(); 68 - 69 - if (task->files) 70 - file = fcheck_files(task->files, idx); 71 - 67 + file = task_lookup_fd_rcu(task, idx); 72 68 rcu_read_unlock(); 73 - task_unlock(task); 74 69 75 70 return file; 76 71 } ··· 102 107 { 103 108 struct file *filp, *filp_epoll, *filp_tgt; 104 109 struct kcmp_epoll_slot slot; 105 - struct files_struct *files; 106 110 107 111 if (copy_from_user(&slot, uslot, sizeof(slot))) 108 112 return -EFAULT; ··· 110 116 if (!filp) 111 117 return -EBADF; 112 118 113 - files = get_files_struct(task2); 114 - if (!files) 119 + filp_epoll = fget_task(task2, slot.efd); 120 + if (!filp_epoll) 115 121 return -EBADF; 116 122 117 - spin_lock(&files->file_lock); 118 - filp_epoll = fcheck_files(files, slot.efd); 119 - if (filp_epoll) 120 - get_file(filp_epoll); 121 - else 122 - filp_tgt = ERR_PTR(-EBADF); 123 - spin_unlock(&files->file_lock); 124 - put_files_struct(files); 125 - 126 - if (filp_epoll) { 127 - filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); 128 - fput(filp_epoll); 129 - } 123 + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); 124 + fput(filp_epoll); 130 125 131 126 if (IS_ERR(filp_tgt)) 132 127 return PTR_ERR(filp_tgt);