Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Replace the fd_sets in struct fdtable with an array of unsigned longs

Replace the fd_sets in struct fdtable with an array of unsigned longs and then
use the standard non-atomic bit operations rather than the FD_* macros.

This:

(1) Removes the abuses of struct fd_set:

(a) Since we don't want to allocate a full fd_set the vast majority of the
time, we in effect allocate a just-big-enough array of unsigned longs
and cast it to an fd_set type - so why bother with the fd_set at all?

(b) Some places outside of the core fdtable handling code (such as
SELinux) want to look inside the array of unsigned longs hidden inside
the fd_set struct for more efficient iteration over the entire set.

(2) Eliminates the use of FD_*() macros in the kernel completely.

(3) Permits the __FD_*() macros to be deleted entirely where not exposed to
userspace.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>

Authored by David Howells and committed by H. Peter Anvin.
1fd36adc 1dce27c5

+37 -47
+2 -2
fs/exec.c
··· 1026 1026 fdt = files_fdtable(files); 1027 1027 if (i >= fdt->max_fds) 1028 1028 break; 1029 - set = fdt->close_on_exec->fds_bits[j]; 1029 + set = fdt->close_on_exec[j]; 1030 1030 if (!set) 1031 1031 continue; 1032 - fdt->close_on_exec->fds_bits[j] = 0; 1032 + fdt->close_on_exec[j] = 0; 1033 1033 spin_unlock(&files->file_lock); 1034 1034 for ( ; set ; i++,set >>= 1) { 1035 1035 if (set & 1) {
+22 -24
fs/file.c
··· 40 40 */ 41 41 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 42 42 43 - static void *alloc_fdmem(unsigned int size) 43 + static void *alloc_fdmem(size_t size) 44 44 { 45 45 /* 46 46 * Very large allocations can stress page reclaim, so fall back to ··· 142 142 static struct fdtable * alloc_fdtable(unsigned int nr) 143 143 { 144 144 struct fdtable *fdt; 145 - char *data; 145 + void *data; 146 146 147 147 /* 148 148 * Figure out how many fds we actually want to support in this fdtable. ··· 172 172 data = alloc_fdmem(nr * sizeof(struct file *)); 173 173 if (!data) 174 174 goto out_fdt; 175 - fdt->fd = (struct file **)data; 176 - data = alloc_fdmem(max_t(unsigned int, 175 + fdt->fd = data; 176 + 177 + data = alloc_fdmem(max_t(size_t, 177 178 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); 178 179 if (!data) 179 180 goto out_arr; 180 - fdt->open_fds = (fd_set *)data; 181 - data += nr / BITS_PER_BYTE; 182 - fdt->close_on_exec = (fd_set *)data; 181 + fdt->open_fds = data; 182 + data += nr / BITS_PER_LONG; 183 + fdt->close_on_exec = data; 183 184 fdt->next = NULL; 184 185 185 186 return fdt; ··· 276 275 int i; 277 276 278 277 /* Find the last open fd */ 279 - for (i = size/(8*sizeof(long)); i > 0; ) { 280 - if (fdt->open_fds->fds_bits[--i]) 278 + for (i = size / BITS_PER_LONG; i > 0; ) { 279 + if (fdt->open_fds[--i]) 281 280 break; 282 281 } 283 - i = (i+1) * 8 * sizeof(long); 282 + i = (i + 1) * BITS_PER_LONG; 284 283 return i; 285 284 } 286 285 ··· 307 306 newf->next_fd = 0; 308 307 new_fdt = &newf->fdtab; 309 308 new_fdt->max_fds = NR_OPEN_DEFAULT; 310 - new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 311 - new_fdt->open_fds = (fd_set *)&newf->open_fds_init; 309 + new_fdt->close_on_exec = newf->close_on_exec_init; 310 + new_fdt->open_fds = newf->open_fds_init; 312 311 new_fdt->fd = &newf->fd_array[0]; 313 312 new_fdt->next = NULL; 314 313 ··· 351 350 old_fds = old_fdt->fd; 352 351 new_fds = new_fdt->fd; 353 352 354 - 
memcpy(new_fdt->open_fds->fds_bits, 355 - old_fdt->open_fds->fds_bits, open_files/8); 356 - memcpy(new_fdt->close_on_exec->fds_bits, 357 - old_fdt->close_on_exec->fds_bits, open_files/8); 353 + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); 354 + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); 358 355 359 356 for (i = open_files; i != 0; i--) { 360 357 struct file *f = *old_fds++; ··· 378 379 memset(new_fds, 0, size); 379 380 380 381 if (new_fdt->max_fds > open_files) { 381 - int left = (new_fdt->max_fds-open_files)/8; 382 - int start = open_files / (8 * sizeof(unsigned long)); 382 + int left = (new_fdt->max_fds - open_files) / 8; 383 + int start = open_files / BITS_PER_LONG; 383 384 384 - memset(&new_fdt->open_fds->fds_bits[start], 0, left); 385 - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 385 + memset(&new_fdt->open_fds[start], 0, left); 386 + memset(&new_fdt->close_on_exec[start], 0, left); 386 387 } 387 388 388 389 rcu_assign_pointer(newf->fdt, new_fdt); ··· 418 419 .fdtab = { 419 420 .max_fds = NR_OPEN_DEFAULT, 420 421 .fd = &init_files.fd_array[0], 421 - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 422 - .open_fds = (fd_set *)&init_files.open_fds_init, 422 + .close_on_exec = init_files.close_on_exec_init, 423 + .open_fds = init_files.open_fds_init, 423 424 }, 424 425 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 425 426 }; ··· 442 443 fd = files->next_fd; 443 444 444 445 if (fd < fdt->max_fds) 445 - fd = find_next_zero_bit(fdt->open_fds->fds_bits, 446 - fdt->max_fds, fd); 446 + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); 447 447 448 448 error = expand_files(files, fd); 449 449 if (error < 0)
+1 -1
fs/select.c
··· 348 348 set = ~(~0UL << (n & (__NFDBITS-1))); 349 349 n /= __NFDBITS; 350 350 fdt = files_fdtable(current->files); 351 - open_fds = fdt->open_fds->fds_bits+n; 351 + open_fds = fdt->open_fds + n; 352 352 max = 0; 353 353 if (set) { 354 354 set &= BITS(fds, n);
+10 -18
include/linux/fdtable.h
··· 21 21 */ 22 22 #define NR_OPEN_DEFAULT BITS_PER_LONG 23 23 24 - /* 25 - * The embedded_fd_set is a small fd_set, 26 - * suitable for most tasks (which open <= BITS_PER_LONG files) 27 - */ 28 - struct embedded_fd_set { 29 - unsigned long fds_bits[1]; 30 - }; 31 - 32 24 struct fdtable { 33 25 unsigned int max_fds; 34 26 struct file __rcu **fd; /* current fd array */ 35 - fd_set *close_on_exec; 36 - fd_set *open_fds; 27 + unsigned long *close_on_exec; 28 + unsigned long *open_fds; 37 29 struct rcu_head rcu; 38 30 struct fdtable *next; 39 31 }; 40 32 41 33 static inline void __set_close_on_exec(int fd, struct fdtable *fdt) 42 34 { 43 - FD_SET(fd, fdt->close_on_exec); 35 + __set_bit(fd, fdt->close_on_exec); 44 36 } 45 37 46 38 static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) 47 39 { 48 - FD_CLR(fd, fdt->close_on_exec); 40 + __clear_bit(fd, fdt->close_on_exec); 49 41 } 50 42 51 43 static inline bool close_on_exec(int fd, const struct fdtable *fdt) 52 44 { 53 - return FD_ISSET(fd, fdt->close_on_exec); 45 + return test_bit(fd, fdt->close_on_exec); 54 46 } 55 47 56 48 static inline void __set_open_fd(int fd, struct fdtable *fdt) 57 49 { 58 - FD_SET(fd, fdt->open_fds); 50 + __set_bit(fd, fdt->open_fds); 59 51 } 60 52 61 53 static inline void __clear_open_fd(int fd, struct fdtable *fdt) 62 54 { 63 - FD_CLR(fd, fdt->open_fds); 55 + __clear_bit(fd, fdt->open_fds); 64 56 } 65 57 66 58 static inline bool fd_is_open(int fd, const struct fdtable *fdt) 67 59 { 68 - return FD_ISSET(fd, fdt->open_fds); 60 + return test_bit(fd, fdt->open_fds); 69 61 } 70 62 71 63 /* ··· 75 83 */ 76 84 spinlock_t file_lock ____cacheline_aligned_in_smp; 77 85 int next_fd; 78 - struct embedded_fd_set close_on_exec_init; 79 - struct embedded_fd_set open_fds_init; 86 + unsigned long close_on_exec_init[1]; 87 + unsigned long open_fds_init[1]; 80 88 struct file __rcu * fd_array[NR_OPEN_DEFAULT]; 81 89 }; 82 90
+1 -1
kernel/exit.c
··· 473 473 i = j * __NFDBITS; 474 474 if (i >= fdt->max_fds) 475 475 break; 476 - set = fdt->open_fds->fds_bits[j++]; 476 + set = fdt->open_fds[j++]; 477 477 while (set) { 478 478 if (set & 1) { 479 479 struct file * file = xchg(&fdt->fd[i], NULL);
+1 -1
security/selinux/hooks.c
··· 2145 2145 fdt = files_fdtable(files); 2146 2146 if (i >= fdt->max_fds) 2147 2147 break; 2148 - set = fdt->open_fds->fds_bits[j]; 2148 + set = fdt->open_fds[j]; 2149 2149 if (!set) 2150 2150 continue; 2151 2151 spin_unlock(&files->file_lock);