···152152 * 'unsigned long' in some places, but simply because that is how the Linux153153 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",154154 * they are very much "bits in an array of unsigned long".155155- *156156- * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied157157- * by that "1024/sizeof(ptr)" before, we already know there are sufficient158158- * clear low bits. Clang seems to realize that, gcc ends up being confused.159159- *160160- * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,161161- * let's consider it documentation (and maybe a test-case for gcc to improve162162- * its code generation ;)163155 */164164-static struct fdtable * alloc_fdtable(unsigned int nr)156156+static struct fdtable *alloc_fdtable(unsigned int slots_wanted)165157{166158 struct fdtable *fdt;159159+ unsigned int nr;167160 void *data;168161169162 /*···164171 * Allocation steps are keyed to the size of the fdarray, since it165172 * grows far faster than any of the other dynamic data. We try to fit166173 * the fdarray into comfortable page-tuned chunks: starting at 1024B167167- * and growing in powers of two from there on.174174+ * and growing in powers of two from there on. Since we called only175175+ * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab176176+ * already gives BITS_PER_LONG slots), the above boils down to177177+ * 1. use the smallest power of two large enough to give us that many178178+ * slots.179179+ * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is180180+ * 256 slots (i.e. 1Kb fd array).181181+ * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there182182+ * and we are never going to be asked for 64 or less.168183 */169169- nr /= (1024 / sizeof(struct file *));170170- nr = roundup_pow_of_two(nr + 1);171171- nr *= (1024 / sizeof(struct file *));172172- nr = ALIGN(nr, BITS_PER_LONG);184184+ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)185185+ nr = 256;186186+ else187187+ nr = roundup_pow_of_two(slots_wanted);173188 /*174189 * Note that this can drive nr *below* what we had passed if sysctl_nr_open175175- * had been set lower between the check in expand_files() and here. Deal176176- * with that in caller, it's cheaper that way.190190+ * had been set lower between the check in expand_files() and here.177191 *178192 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise179193 * bitmaps handling below becomes unpleasant, to put it mildly...180194 */181181- if (unlikely(nr > sysctl_nr_open))182182- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;195195+ if (unlikely(nr > sysctl_nr_open)) {196196+ nr = round_down(sysctl_nr_open, BITS_PER_LONG);197197+ if (nr < slots_wanted)198198+ return ERR_PTR(-EMFILE);199199+ }183200184201 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);185202 if (!fdt)···218215out_fdt:219216 kfree(fdt);220217out:221221- return NULL;218218+ return ERR_PTR(-ENOMEM);222219}223220224221/*225222 * Expand the file descriptor table.226223 * This function will allocate a new fdtable and both fd array and fdset, of227224 * the given size.228228- * Return <0 error code on error; 1 on successful completion.225225+ * Return <0 error code on error; 0 on successful completion.229226 * The files->file_lock should be held on entry, and will be held on exit.230227 */231228static int expand_fdtable(struct files_struct *files, unsigned int nr)···235232 struct fdtable *new_fdt, *cur_fdt;236233237234 spin_unlock(&files->file_lock);238238- new_fdt = alloc_fdtable(nr);235235+ new_fdt = alloc_fdtable(nr + 1);239236240237 /* make sure all fd_install() have seen resize_in_progress241238 * or have finished their rcu_read_lock_sched() section.···244241 synchronize_rcu();245242246243 spin_lock(&files->file_lock);247247- if (!new_fdt)248248- return -ENOMEM;249249- /*250250- * extremely unlikely race - sysctl_nr_open decreased between the check in251251- * caller and alloc_fdtable(). Cheaper to catch it here...252252- */253253- if (unlikely(new_fdt->max_fds <= nr)) {254254- __free_fdtable(new_fdt);255255- return -EMFILE;256256- }244244+ if (IS_ERR(new_fdt))245245+ return PTR_ERR(new_fdt);257246 cur_fdt = files_fdtable(files);258247 BUG_ON(nr < cur_fdt->max_fds);259248 copy_fdtable(new_fdt, cur_fdt);···254259 call_rcu(&cur_fdt->rcu, free_fdtable_rcu);255260 /* coupled with smp_rmb() in fd_install() */256261 smp_wmb();257257- return 1;262262+ return 0;258263}259264260265/*261266 * Expand files.262267 * This function will expand the file structures, if the requested size exceeds263268 * the current capacity and there is room for expansion.264264- * Return <0 error code on error; 0 when nothing done; 1 when files were265265- * expanded and execution may have blocked.269269+ * Return <0 error code on error; 0 on success.266270 * The files->file_lock should be held on entry, and will be held on exit.267271 */268272static int expand_files(struct files_struct *files, unsigned int nr)···269275 __acquires(files->file_lock)270276{271277 struct fdtable *fdt;272272- int expanded = 0;278278+ int error;273279274280repeat:275281 fdt = files_fdtable(files);276282277283 /* Do we need to expand? */278284 if (nr < fdt->max_fds)279279- return expanded;285285+ return 0;280286281287 /* Can we expand? */282288 if (nr >= sysctl_nr_open)···284290285291 if (unlikely(files->resize_in_progress)) {286292 spin_unlock(&files->file_lock);287287- expanded = 1;288293 wait_event(files->resize_wait, !files->resize_in_progress);289294 spin_lock(&files->file_lock);290295 goto repeat;···291298292299 /* All good, so we try */293300 files->resize_in_progress = true;294294- expanded = expand_fdtable(files, nr);301301+ error = expand_fdtable(files, nr);295302 files->resize_in_progress = false;296303297304 wake_up_all(&files->resize_wait);298298- return expanded;305305+ return error;299306}300307301301-static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)308308+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,309309+ bool set)302310{303303- __set_bit(fd, fdt->close_on_exec);311311+ if (set) {312312+ __set_bit(fd, fdt->close_on_exec);313313+ } else {314314+ if (test_bit(fd, fdt->close_on_exec))315315+ __clear_bit(fd, fdt->close_on_exec);316316+ }304317}305318306306-static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)307307-{308308- if (test_bit(fd, fdt->close_on_exec))309309- __clear_bit(fd, fdt->close_on_exec);310310-}311311-312312-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)319319+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)313320{314321 __set_bit(fd, fdt->open_fds);322322+ __set_close_on_exec(fd, fdt, set);315323 fd /= BITS_PER_LONG;316324 if (!~fdt->open_fds[fd])317325 __set_bit(fd, fdt->full_fds_bits);···321327static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)322328{323329 __clear_bit(fd, fdt->open_fds);324324- __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);330330+ fd /= BITS_PER_LONG;331331+ if (test_bit(fd, fdt->full_fds_bits))332332+ __clear_bit(fd, fdt->full_fds_bits);325333}326334327335static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)···365369 struct file **old_fds, **new_fds;366370 unsigned int open_files, i;367371 struct fdtable *old_fdt, *new_fdt;368368- int error;369372370373 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);371374 if (!newf)···396401 if (new_fdt != &newf->fdtab)397402 __free_fdtable(new_fdt);398403399399- new_fdt = alloc_fdtable(open_files - 1);400400- if (!new_fdt) {401401- error = -ENOMEM;402402- goto out_release;403403- }404404-405405- /* beyond sysctl_nr_open; nothing to do */406406- if (unlikely(new_fdt->max_fds < open_files)) {407407- __free_fdtable(new_fdt);408408- error = -EMFILE;409409- goto out_release;404404+ new_fdt = alloc_fdtable(open_files);405405+ if (IS_ERR(new_fdt)) {406406+ kmem_cache_free(files_cachep, newf);407407+ return ERR_CAST(new_fdt);410408 }411409412410 /*···440452 rcu_assign_pointer(newf->fdt, new_fdt);441453442454 return newf;443443-444444-out_release:445445- kmem_cache_free(files_cachep, newf);446446- return ERR_PTR(error);447455}448456449457static struct fdtable *close_files(struct files_struct * files)···460476 set = fdt->open_fds[j++];461477 while (set) {462478 if (set & 1) {463463- struct file * file = xchg(&fdt->fd[i], NULL);479479+ struct file *file = fdt->fd[i];464480 if (file) {465481 filp_close(file, files);466482 cond_resched();···517533 unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */518534 unsigned int maxbit = maxfd / BITS_PER_LONG;519535 unsigned int bitbit = start / BITS_PER_LONG;536536+ unsigned int bit;537537+538538+ /*539539+ * Try to avoid looking at the second level bitmap540540+ */541541+ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,542542+ start & (BITS_PER_LONG - 1));543543+ if (bit < BITS_PER_LONG)544544+ return bit + bitbit * BITS_PER_LONG;520545521546 bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;522547 if (bitbit >= maxfd)···552559 if (fd < files->next_fd)553560 fd = files->next_fd;554561555555- if (fd < fdt->max_fds)562562+ if (likely(fd < fdt->max_fds))556563 fd = find_next_fd(fdt, fd);557564558565 /*···560567 * will limit the total number of files that can be opened.561568 */562569 error = -EMFILE;563563- if (fd >= end)570570+ if (unlikely(fd >= end))564571 goto out;565572566566- error = expand_files(files, fd);567567- if (error < 0)568568- goto out;573573+ if (unlikely(fd >= fdt->max_fds)) {574574+ error = expand_files(files, fd);575575+ if (error < 0)576576+ goto out;569577570570- /*571571- * If we needed to expand the fs array we572572- * might have blocked - try again.573573- */574574- if (error)575578 goto repeat;579579+ }576580577581 if (start <= files->next_fd)578582 files->next_fd = fd + 1;579583580580- __set_open_fd(fd, fdt);581581- if (flags & O_CLOEXEC)582582- __set_close_on_exec(fd, fdt);583583- else584584- __clear_close_on_exec(fd, fdt);584584+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);585585 error = fd;586586-#if 1587587- /* Sanity check */588588- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {589589- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);590590- rcu_assign_pointer(fdt->fd[fd], NULL);591591- }592592-#endif593586594587out:595588 spin_unlock(&files->file_lock);···641662 rcu_read_unlock_sched();642663 spin_lock(&files->file_lock);643664 fdt = files_fdtable(files);644644- BUG_ON(fdt->fd[fd] != NULL);665665+ WARN_ON(fdt->fd[fd] != NULL);645666 rcu_assign_pointer(fdt->fd[fd], file);646667 spin_unlock(&files->file_lock);647668 return;···755776}756777757778/**758758- * __close_range() - Close all file descriptors in a given range.779779+ * sys_close_range() - Close all file descriptors in a given range.759780 *760781 * @fd: starting file descriptor to close761782 * @max_fd: last file descriptor to close···763784 *764785 * This closes a range of file descriptors. All file descriptors765786 * from @fd up to and including @max_fd are closed.787787+ * Currently, errors to close a given file descriptor are ignored.766788 */767767-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)789789+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,790790+ unsigned int, flags)768791{769792 struct task_struct *me = current;770793 struct files_struct *cur_fds = me->files, *fds = NULL;···10811100 return file;10821101}1083110210841084-struct file *lookup_fdget_rcu(unsigned int fd)10851085-{10861086- return __fget_files_rcu(current->files, fd, 0);10871087-10881088-}10891089-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);10901090-10911091-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)10921092-{10931093- /* Must be called with rcu_read_lock held */10941094- struct files_struct *files;10951095- struct file *file = NULL;10961096-10971097- task_lock(task);10981098- files = task->files;10991099- if (files)11001100- file = __fget_files_rcu(files, fd, 0);11011101- task_unlock(task);11021102-11031103- return file;11041104-}11051105-11061106-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)11031103+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)11071104{11081105 /* Must be called with rcu_read_lock held */11091106 struct files_struct *files;···10911132 task_lock(task);10921133 files = task->files;10931134 if (files) {11351135+ rcu_read_lock();10941136 for (; fd < files_fdtable(files)->max_fds; fd++) {10951137 file = __fget_files_rcu(files, fd, 0);10961138 if (file)10971139 break;10981140 }11411141+ rcu_read_unlock();10991142 }11001143 task_unlock(task);11011144 *ret_fd = fd;11021145 return file;11031146}11041104-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);11471147+EXPORT_SYMBOL(fget_task_next);1105114811061149/*11071150 * Lightweight file lookup - no refcnt increment if fd table isn't shared.···12001239void set_close_on_exec(unsigned int fd, int flag)12011240{12021241 struct files_struct *files = current->files;12031203- struct fdtable *fdt;12041242 spin_lock(&files->file_lock);12051205- fdt = files_fdtable(files);12061206- if (flag)12071207- __set_close_on_exec(fd, fdt);12081208- else12091209- __clear_close_on_exec(fd, fdt);12431243+ __set_close_on_exec(fd, files_fdtable(files), flag);12101244 spin_unlock(&files->file_lock);12111245}12121246···12421286 goto Ebusy;12431287 get_file(file);12441288 rcu_assign_pointer(fdt->fd[fd], file);12451245- __set_open_fd(fd, fdt);12461246- if (flags & O_CLOEXEC)12471247- __set_close_on_exec(fd, fdt);12481248- else12491249- __clear_close_on_exec(fd, fdt);12891289+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);12501290 spin_unlock(&files->file_lock);1251129112521292 if (tofree)
···1616#include <linux/security.h>1717#include <linux/spinlock.h>1818#include <linux/slab.h>1919-#include <linux/fdtable.h>2019#include <linux/fsnotify_backend.h>21202221static int dir_notify_enable __read_mostly = 1;···346347 new_fsn_mark = NULL;347348 }348349349349- rcu_read_lock();350350- f = lookup_fdget_rcu(fd);351351- rcu_read_unlock();350350+ f = fget_raw(fd);352351353352 /* if (f != filp) means that we lost a race and another task/thread354353 * actually closed the fd we are still playing with before we grabbed
···15741574 return retval;15751575}1576157615771577-/**15781578- * sys_close_range() - Close all file descriptors in a given range.15791579- *15801580- * @fd: starting file descriptor to close15811581- * @max_fd: last file descriptor to close15821582- * @flags: reserved for future extensions15831583- *15841584- * This closes a range of file descriptors. All file descriptors15851585- * from @fd up to and including @max_fd are closed.15861586- * Currently, errors to close a given file descriptor are ignored.15871587- */15881588-SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,15891589- unsigned int, flags)15901590-{15911591- return __close_range(fd, max_fd, flags);15921592-}15931593-15941577/*15951578 * This routine simulates a hangup on the tty, to arrange that users15961579 * are given clean terminals at login time.
···55#include <linux/namei.h>66#include <linux/pid_namespace.h>77#include <linux/fs.h>88-#include <linux/fdtable.h>98#include <linux/filter.h>109#include <linux/bpf_mem_alloc.h>1110#include <linux/btf_ids.h>···285286 curr_fd = 0;286287 }287288288288- rcu_read_lock();289289- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);289289+ f = fget_task_next(curr_task, &curr_fd);290290 if (f) {291291 /* set info->fd */292292 info->fd = curr_fd;293293- rcu_read_unlock();294293 return f;295294 }296295297296 /* the current task is done, go to the next task */298298- rcu_read_unlock();299297 put_task_struct(curr_task);300298301299 if (info->common.type == BPF_TASK_ITER_TID) {