Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] files: files struct with RCU

Patch to eliminate the struct files_struct.file_lock spinlock on the reader side
and use the RCU refcounting rcuref_xxx API for the f_count refcounter. The
updates to the fdtable are done by allocating a new fdtable structure and
setting files->fdt to point to the new structure. The fdtable structure is
protected by RCU thereby allowing lock-free lookup. For fd arrays/sets that
are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A
global list of fdtables to be freed is not scalable, so we use a per-CPU list.
If keventd is already handling the current cpu's work, we use a timer to defer
queueing of that work.

Since the last publication, this patch has been re-written to avoid using
explicit memory barriers and use rcu_assign_pointer(), rcu_dereference()
primitives instead. This required that the fd information be kept in a
separate structure (fdtable) and updated atomically.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Dipankar Sarma and committed by
Linus Torvalds
ab2af1f5 6e72ad2c

+354 -175
+2 -1
fs/aio.c
··· 29 29 #include <linux/highmem.h> 30 30 #include <linux/workqueue.h> 31 31 #include <linux/security.h> 32 + #include <linux/rcuref.h> 32 33 33 34 #include <asm/kmap_types.h> 34 35 #include <asm/uaccess.h> ··· 500 499 /* Must be done under the lock to serialise against cancellation. 501 500 * Call this aio_fput as it duplicates fput via the fput_work. 502 501 */ 503 - if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { 502 + if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { 504 503 get_ioctx(ctx); 505 504 spin_lock(&fput_lock); 506 505 list_add(&req->ki_list, &fput_head);
+10 -3
fs/fcntl.c
··· 16 16 #include <linux/security.h> 17 17 #include <linux/ptrace.h> 18 18 #include <linux/signal.h> 19 + #include <linux/rcupdate.h> 19 20 20 21 #include <asm/poll.h> 21 22 #include <asm/siginfo.h> ··· 65 64 if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 66 65 goto out; 67 66 68 - fdt = files_fdtable(files); 69 67 repeat: 68 + fdt = files_fdtable(files); 70 69 /* 71 70 * Someone might have closed fd's in the range 72 71 * orig_start..fdt->next_fd ··· 96 95 if (error) 97 96 goto repeat; 98 97 98 + /* 99 + * We reacquired files_lock, so we are safe as long as 100 + * we reacquire the fdtable pointer and use it while holding 101 + * the lock, no one can free it during that time. 102 + */ 103 + fdt = files_fdtable(files); 99 104 if (start <= fdt->next_fd) 100 105 fdt->next_fd = newfd + 1; 101 - 106 + 102 107 error = newfd; 103 108 104 109 out: ··· 170 163 if (!tofree && FD_ISSET(newfd, fdt->open_fds)) 171 164 goto out_fput; 172 165 173 - fdt->fd[newfd] = file; 166 + rcu_assign_pointer(fdt->fd[newfd], file); 174 167 FD_SET(newfd, fdt->open_fds); 175 168 FD_CLR(newfd, fdt->close_on_exec); 176 169 spin_unlock(&files->file_lock);
+267 -140
fs/file.c
··· 13 13 #include <linux/vmalloc.h> 14 14 #include <linux/file.h> 15 15 #include <linux/bitops.h> 16 + #include <linux/interrupt.h> 17 + #include <linux/spinlock.h> 18 + #include <linux/rcupdate.h> 19 + #include <linux/workqueue.h> 20 + 21 + struct fdtable_defer { 22 + spinlock_t lock; 23 + struct work_struct wq; 24 + struct timer_list timer; 25 + struct fdtable *next; 26 + }; 27 + 28 + /* 29 + * We use this list to defer free fdtables that have vmalloced 30 + * sets/arrays. By keeping a per-cpu list, we avoid having to embed 31 + * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in 32 + * this per-task structure. 33 + */ 34 + static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 16 35 17 36 18 37 /* ··· 67 48 vfree(array); 68 49 } 69 50 70 - /* 71 - * Expand the fd array in the files_struct. Called with the files 72 - * spinlock held for write. 73 - */ 74 - 75 - static int expand_fd_array(struct files_struct *files, int nr) 76 - __releases(files->file_lock) 77 - __acquires(files->file_lock) 51 + static void __free_fdtable(struct fdtable *fdt) 78 52 { 79 - struct file **new_fds; 80 - int error, nfds; 53 + int fdset_size, fdarray_size; 54 + 55 + fdset_size = fdt->max_fdset / 8; 56 + fdarray_size = fdt->max_fds * sizeof(struct file *); 57 + free_fdset(fdt->open_fds, fdset_size); 58 + free_fdset(fdt->close_on_exec, fdset_size); 59 + free_fd_array(fdt->fd, fdarray_size); 60 + kfree(fdt); 61 + } 62 + 63 + static void fdtable_timer(unsigned long data) 64 + { 65 + struct fdtable_defer *fddef = (struct fdtable_defer *)data; 66 + 67 + spin_lock(&fddef->lock); 68 + /* 69 + * If someone already emptied the queue return. 
70 + */ 71 + if (!fddef->next) 72 + goto out; 73 + if (!schedule_work(&fddef->wq)) 74 + mod_timer(&fddef->timer, 5); 75 + out: 76 + spin_unlock(&fddef->lock); 77 + } 78 + 79 + static void free_fdtable_work(struct fdtable_defer *f) 80 + { 81 81 struct fdtable *fdt; 82 82 83 - 84 - error = -EMFILE; 85 - fdt = files_fdtable(files); 86 - if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) 87 - goto out; 88 - 89 - nfds = fdt->max_fds; 90 - spin_unlock(&files->file_lock); 91 - 92 - /* 93 - * Expand to the max in easy steps, and keep expanding it until 94 - * we have enough for the requested fd array size. 95 - */ 96 - 97 - do { 98 - #if NR_OPEN_DEFAULT < 256 99 - if (nfds < 256) 100 - nfds = 256; 101 - else 102 - #endif 103 - if (nfds < (PAGE_SIZE / sizeof(struct file *))) 104 - nfds = PAGE_SIZE / sizeof(struct file *); 105 - else { 106 - nfds = nfds * 2; 107 - if (nfds > NR_OPEN) 108 - nfds = NR_OPEN; 109 - } 110 - } while (nfds <= nr); 111 - 112 - error = -ENOMEM; 113 - new_fds = alloc_fd_array(nfds); 114 - spin_lock(&files->file_lock); 115 - if (!new_fds) 116 - goto out; 117 - 118 - /* Copy the existing array and install the new pointer */ 119 - fdt = files_fdtable(files); 120 - 121 - if (nfds > fdt->max_fds) { 122 - struct file **old_fds; 123 - int i; 124 - 125 - old_fds = xchg(&fdt->fd, new_fds); 126 - i = xchg(&fdt->max_fds, nfds); 127 - 128 - /* Don't copy/clear the array if we are creating a new 129 - fd array for fork() */ 130 - if (i) { 131 - memcpy(new_fds, old_fds, i * sizeof(struct file *)); 132 - /* clear the remainder of the array */ 133 - memset(&new_fds[i], 0, 134 - (nfds-i) * sizeof(struct file *)); 135 - 136 - spin_unlock(&files->file_lock); 137 - free_fd_array(old_fds, i); 138 - spin_lock(&files->file_lock); 139 - } 140 - } else { 141 - /* Somebody expanded the array while we slept ... 
*/ 142 - spin_unlock(&files->file_lock); 143 - free_fd_array(new_fds, nfds); 144 - spin_lock(&files->file_lock); 83 + spin_lock_bh(&f->lock); 84 + fdt = f->next; 85 + f->next = NULL; 86 + spin_unlock_bh(&f->lock); 87 + while(fdt) { 88 + struct fdtable *next = fdt->next; 89 + __free_fdtable(fdt); 90 + fdt = next; 145 91 } 146 - error = 0; 147 - out: 148 - return error; 92 + } 93 + 94 + static void free_fdtable_rcu(struct rcu_head *rcu) 95 + { 96 + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); 97 + int fdset_size, fdarray_size; 98 + struct fdtable_defer *fddef; 99 + 100 + BUG_ON(!fdt); 101 + fdset_size = fdt->max_fdset / 8; 102 + fdarray_size = fdt->max_fds * sizeof(struct file *); 103 + 104 + if (fdt->free_files) { 105 + /* 106 + * The this fdtable was embedded in the files structure 107 + * and the files structure itself was getting destroyed. 108 + * It is now safe to free the files structure. 109 + */ 110 + kmem_cache_free(files_cachep, fdt->free_files); 111 + return; 112 + } 113 + if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) { 114 + /* 115 + * The fdtable was embedded 116 + */ 117 + return; 118 + } 119 + if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { 120 + kfree(fdt->open_fds); 121 + kfree(fdt->close_on_exec); 122 + kfree(fdt->fd); 123 + kfree(fdt); 124 + } else { 125 + fddef = &get_cpu_var(fdtable_defer_list); 126 + spin_lock(&fddef->lock); 127 + fdt->next = fddef->next; 128 + fddef->next = fdt; 129 + /* 130 + * vmallocs are handled from the workqueue context. 131 + * If the per-cpu workqueue is running, then we 132 + * defer work scheduling through a timer. 
133 + */ 134 + if (!schedule_work(&fddef->wq)) 135 + mod_timer(&fddef->timer, 5); 136 + spin_unlock(&fddef->lock); 137 + put_cpu_var(fdtable_defer_list); 138 + } 139 + } 140 + 141 + void free_fdtable(struct fdtable *fdt) 142 + { 143 + if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE || 144 + fdt->max_fds > NR_OPEN_DEFAULT) 145 + call_rcu(&fdt->rcu, free_fdtable_rcu); 146 + } 147 + 148 + /* 149 + * Expand the fdset in the files_struct. Called with the files spinlock 150 + * held for write. 151 + */ 152 + static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) 153 + { 154 + int i; 155 + int count; 156 + 157 + BUG_ON(nfdt->max_fdset < fdt->max_fdset); 158 + BUG_ON(nfdt->max_fds < fdt->max_fds); 159 + /* Copy the existing tables and install the new pointers */ 160 + 161 + i = fdt->max_fdset / (sizeof(unsigned long) * 8); 162 + count = (nfdt->max_fdset - fdt->max_fdset) / 8; 163 + 164 + /* 165 + * Don't copy the entire array if the current fdset is 166 + * not yet initialised. 167 + */ 168 + if (i) { 169 + memcpy (nfdt->open_fds, fdt->open_fds, 170 + fdt->max_fdset/8); 171 + memcpy (nfdt->close_on_exec, fdt->close_on_exec, 172 + fdt->max_fdset/8); 173 + memset (&nfdt->open_fds->fds_bits[i], 0, count); 174 + memset (&nfdt->close_on_exec->fds_bits[i], 0, count); 175 + } 176 + 177 + /* Don't copy/clear the array if we are creating a new 178 + fd array for fork() */ 179 + if (fdt->max_fds) { 180 + memcpy(nfdt->fd, fdt->fd, 181 + fdt->max_fds * sizeof(struct file *)); 182 + /* clear the remainder of the array */ 183 + memset(&nfdt->fd[fdt->max_fds], 0, 184 + (nfdt->max_fds - fdt->max_fds) * 185 + sizeof(struct file *)); 186 + } 187 + nfdt->next_fd = fdt->next_fd; 149 188 } 150 189 151 190 /* ··· 234 157 vfree(array); 235 158 } 236 159 237 - /* 238 - * Expand the fdset in the files_struct. Called with the files spinlock 239 - * held for write. 
240 - */ 241 - static int expand_fdset(struct files_struct *files, int nr) 242 - __releases(file->file_lock) 243 - __acquires(file->file_lock) 160 + static struct fdtable *alloc_fdtable(int nr) 244 161 { 245 - fd_set *new_openset = NULL, *new_execset = NULL; 246 - int error, nfds = 0; 247 - struct fdtable *fdt; 162 + struct fdtable *fdt = NULL; 163 + int nfds = 0; 164 + fd_set *new_openset = NULL, *new_execset = NULL; 165 + struct file **new_fds; 248 166 249 - error = -EMFILE; 250 - fdt = files_fdtable(files); 251 - if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN) 252 - goto out; 167 + fdt = kmalloc(sizeof(*fdt), GFP_KERNEL); 168 + if (!fdt) 169 + goto out; 170 + memset(fdt, 0, sizeof(*fdt)); 253 171 254 - nfds = fdt->max_fdset; 255 - spin_unlock(&files->file_lock); 256 - 257 - /* Expand to the max in easy steps */ 258 - do { 172 + nfds = __FD_SETSIZE; 173 + /* Expand to the max in easy steps */ 174 + do { 259 175 if (nfds < (PAGE_SIZE * 8)) 260 176 nfds = PAGE_SIZE * 8; 261 177 else { ··· 258 188 } 259 189 } while (nfds <= nr); 260 190 261 - error = -ENOMEM; 262 - new_openset = alloc_fdset(nfds); 263 - new_execset = alloc_fdset(nfds); 264 - spin_lock(&files->file_lock); 265 - if (!new_openset || !new_execset) 191 + new_openset = alloc_fdset(nfds); 192 + new_execset = alloc_fdset(nfds); 193 + if (!new_openset || !new_execset) 194 + goto out; 195 + fdt->open_fds = new_openset; 196 + fdt->close_on_exec = new_execset; 197 + fdt->max_fdset = nfds; 198 + 199 + nfds = NR_OPEN_DEFAULT; 200 + /* 201 + * Expand to the max in easy steps, and keep expanding it until 202 + * we have enough for the requested fd array size. 
203 + */ 204 + do { 205 + #if NR_OPEN_DEFAULT < 256 206 + if (nfds < 256) 207 + nfds = 256; 208 + else 209 + #endif 210 + if (nfds < (PAGE_SIZE / sizeof(struct file *))) 211 + nfds = PAGE_SIZE / sizeof(struct file *); 212 + else { 213 + nfds = nfds * 2; 214 + if (nfds > NR_OPEN) 215 + nfds = NR_OPEN; 216 + } 217 + } while (nfds <= nr); 218 + new_fds = alloc_fd_array(nfds); 219 + if (!new_fds) 266 220 goto out; 267 - 268 - error = 0; 269 - 270 - /* Copy the existing tables and install the new pointers */ 271 - fdt = files_fdtable(files); 272 - if (nfds > fdt->max_fdset) { 273 - int i = fdt->max_fdset / (sizeof(unsigned long) * 8); 274 - int count = (nfds - fdt->max_fdset) / 8; 275 - 276 - /* 277 - * Don't copy the entire array if the current fdset is 278 - * not yet initialised. 279 - */ 280 - if (i) { 281 - memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8); 282 - memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8); 283 - memset (&new_openset->fds_bits[i], 0, count); 284 - memset (&new_execset->fds_bits[i], 0, count); 285 - } 286 - 287 - nfds = xchg(&fdt->max_fdset, nfds); 288 - new_openset = xchg(&fdt->open_fds, new_openset); 289 - new_execset = xchg(&fdt->close_on_exec, new_execset); 290 - spin_unlock(&files->file_lock); 291 - free_fdset (new_openset, nfds); 292 - free_fdset (new_execset, nfds); 293 - spin_lock(&files->file_lock); 294 - return 0; 295 - } 296 - /* Somebody expanded the array while we slept ... */ 297 - 221 + fdt->fd = new_fds; 222 + fdt->max_fds = nfds; 223 + fdt->free_files = NULL; 224 + return fdt; 298 225 out: 226 + if (new_openset) 227 + free_fdset(new_openset, nfds); 228 + if (new_execset) 229 + free_fdset(new_execset, nfds); 230 + kfree(fdt); 231 + return NULL; 232 + } 233 + 234 + /* 235 + * Expands the file descriptor table - it will allocate a new fdtable and 236 + * both fd array and fdset. It is expected to be called with the 237 + * files_lock held. 
238 + */ 239 + static int expand_fdtable(struct files_struct *files, int nr) 240 + __releases(files->file_lock) 241 + __acquires(files->file_lock) 242 + { 243 + int error = 0; 244 + struct fdtable *fdt; 245 + struct fdtable *nfdt = NULL; 246 + 299 247 spin_unlock(&files->file_lock); 300 - if (new_openset) 301 - free_fdset(new_openset, nfds); 302 - if (new_execset) 303 - free_fdset(new_execset, nfds); 248 + nfdt = alloc_fdtable(nr); 249 + if (!nfdt) { 250 + error = -ENOMEM; 251 + spin_lock(&files->file_lock); 252 + goto out; 253 + } 254 + 304 255 spin_lock(&files->file_lock); 256 + fdt = files_fdtable(files); 257 + /* 258 + * Check again since another task may have expanded the 259 + * fd table while we dropped the lock 260 + */ 261 + if (nr >= fdt->max_fds || nr >= fdt->max_fdset) { 262 + copy_fdtable(nfdt, fdt); 263 + } else { 264 + /* Somebody expanded while we dropped file_lock */ 265 + spin_unlock(&files->file_lock); 266 + __free_fdtable(nfdt); 267 + spin_lock(&files->file_lock); 268 + goto out; 269 + } 270 + rcu_assign_pointer(files->fdt, nfdt); 271 + free_fdtable(fdt); 272 + out: 305 273 return error; 306 274 } 307 275 ··· 354 246 struct fdtable *fdt; 355 247 356 248 fdt = files_fdtable(files); 357 - if (nr >= fdt->max_fdset) { 358 - expand = 1; 359 - if ((err = expand_fdset(files, nr))) 249 + if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { 250 + if (fdt->max_fdset >= NR_OPEN || 251 + fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { 252 + err = -EMFILE; 360 253 goto out; 361 - } 362 - if (nr >= fdt->max_fds) { 254 + } 363 255 expand = 1; 364 - if ((err = expand_fd_array(files, nr))) 256 + if ((err = expand_fdtable(files, nr))) 365 257 goto out; 366 258 } 367 259 err = expand; 368 260 out: 369 261 return err; 262 + } 263 + 264 + static void __devinit fdtable_defer_list_init(int cpu) 265 + { 266 + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); 267 + spin_lock_init(&fddef->lock); 268 + INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, 
fddef); 269 + init_timer(&fddef->timer); 270 + fddef->timer.data = (unsigned long)fddef; 271 + fddef->timer.function = fdtable_timer; 272 + fddef->next = NULL; 273 + } 274 + 275 + void __init files_defer_init(void) 276 + { 277 + int i; 278 + /* Really early - can't use for_each_cpu */ 279 + for (i = 0; i < NR_CPUS; i++) 280 + fdtable_defer_list_init(i); 370 281 }
+29 -11
fs/file_table.c
··· 14 14 #include <linux/fs.h> 15 15 #include <linux/security.h> 16 16 #include <linux/eventpoll.h> 17 + #include <linux/rcupdate.h> 17 18 #include <linux/mount.h> 18 19 #include <linux/cdev.h> 19 20 #include <linux/fsnotify.h> ··· 54 53 spin_unlock_irqrestore(&filp_count_lock, flags); 55 54 } 56 55 56 + static inline void file_free_rcu(struct rcu_head *head) 57 + { 58 + struct file *f = container_of(head, struct file, f_rcuhead); 59 + kmem_cache_free(filp_cachep, f); 60 + } 61 + 57 62 static inline void file_free(struct file *f) 58 63 { 59 - kmem_cache_free(filp_cachep, f); 64 + call_rcu(&f->f_rcuhead, file_free_rcu); 60 65 } 61 66 62 67 /* Find an unused file structure and return a pointer to it. ··· 117 110 118 111 void fastcall fput(struct file *file) 119 112 { 120 - if (atomic_dec_and_test(&file->f_count)) 113 + if (rcuref_dec_and_test(&file->f_count)) 121 114 __fput(file); 122 115 } 123 116 ··· 163 156 struct file *file; 164 157 struct files_struct *files = current->files; 165 158 166 - spin_lock(&files->file_lock); 159 + rcu_read_lock(); 167 160 file = fcheck_files(files, fd); 168 - if (file) 169 - get_file(file); 170 - spin_unlock(&files->file_lock); 161 + if (file) { 162 + if (!rcuref_inc_lf(&file->f_count)) { 163 + /* File object ref couldn't be taken */ 164 + rcu_read_unlock(); 165 + return NULL; 166 + } 167 + } 168 + rcu_read_unlock(); 169 + 171 170 return file; 172 171 } 173 172 ··· 195 182 if (likely((atomic_read(&files->count) == 1))) { 196 183 file = fcheck_files(files, fd); 197 184 } else { 198 - spin_lock(&files->file_lock); 185 + rcu_read_lock(); 199 186 file = fcheck_files(files, fd); 200 187 if (file) { 201 - get_file(file); 202 - *fput_needed = 1; 188 + if (rcuref_inc_lf(&file->f_count)) 189 + *fput_needed = 1; 190 + else 191 + /* Didn't get the reference, someone's freed */ 192 + file = NULL; 203 193 } 204 - spin_unlock(&files->file_lock); 194 + rcu_read_unlock(); 205 195 } 196 + 206 197 return file; 207 198 } 208 199 209 200 210 201 void 
put_filp(struct file *file) 211 202 { 212 - if (atomic_dec_and_test(&file->f_count)) { 203 + if (rcuref_dec_and_test(&file->f_count)) { 213 204 security_file_free(file); 214 205 file_kill(file); 215 206 file_free(file); ··· 274 257 files_stat.max_files = n; 275 258 if (files_stat.max_files < NR_FILE) 276 259 files_stat.max_files = NR_FILE; 260 + files_defer_init(); 277 261 }
+4 -4
fs/open.c
··· 24 24 #include <linux/personality.h> 25 25 #include <linux/pagemap.h> 26 26 #include <linux/syscalls.h> 27 + #include <linux/rcupdate.h> 27 28 28 29 #include <asm/unistd.h> 29 30 ··· 931 930 struct fdtable *fdt; 932 931 spin_lock(&files->file_lock); 933 932 fdt = files_fdtable(files); 934 - if (unlikely(fdt->fd[fd] != NULL)) 935 - BUG(); 936 - fdt->fd[fd] = file; 933 + BUG_ON(fdt->fd[fd] != NULL); 934 + rcu_assign_pointer(fdt->fd[fd], file); 937 935 spin_unlock(&files->file_lock); 938 936 } 939 937 ··· 1024 1024 filp = fdt->fd[fd]; 1025 1025 if (!filp) 1026 1026 goto out_unlock; 1027 - fdt->fd[fd] = NULL; 1027 + rcu_assign_pointer(fdt->fd[fd], NULL); 1028 1028 FD_CLR(fd, fdt->close_on_exec); 1029 1029 __put_unused_fd(files, fd); 1030 1030 spin_unlock(&files->file_lock);
+9 -2
include/linux/file.h
··· 9 9 #include <linux/posix_types.h> 10 10 #include <linux/compiler.h> 11 11 #include <linux/spinlock.h> 12 + #include <linux/rcupdate.h> 12 13 13 14 /* 14 15 * The default fd array needs to be at least BITS_PER_LONG, ··· 24 23 struct file ** fd; /* current fd array */ 25 24 fd_set *close_on_exec; 26 25 fd_set *open_fds; 26 + struct rcu_head rcu; 27 + struct files_struct *free_files; 28 + struct fdtable *next; 27 29 }; 28 30 29 31 /* ··· 35 31 struct files_struct { 36 32 atomic_t count; 37 33 spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ 34 + struct fdtable *fdt; 38 35 struct fdtable fdtab; 39 36 fd_set close_on_exec_init; 40 37 fd_set open_fds_init; 41 38 struct file * fd_array[NR_OPEN_DEFAULT]; 42 39 }; 43 40 44 - #define files_fdtable(files) (&(files)->fdtab) 41 + #define files_fdtable(files) (rcu_dereference((files)->fdt)) 45 42 46 43 extern void FASTCALL(__fput(struct file *)); 47 44 extern void FASTCALL(fput(struct file *)); ··· 70 65 extern void free_fdset(fd_set *, int); 71 66 72 67 extern int expand_files(struct files_struct *, int nr); 68 + extern void free_fdtable(struct fdtable *fdt); 69 + extern void __init files_defer_init(void); 73 70 74 71 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) 75 72 { ··· 79 72 struct fdtable *fdt = files_fdtable(files); 80 73 81 74 if (fd < fdt->max_fds) 82 - file = fdt->fd[fd]; 75 + file = rcu_dereference(fdt->fd[fd]); 83 76 return file; 84 77 } 85 78
+3 -1
include/linux/fs.h
··· 9 9 #include <linux/config.h> 10 10 #include <linux/limits.h> 11 11 #include <linux/ioctl.h> 12 + #include <linux/rcuref.h> 12 13 13 14 /* 14 15 * It's silly to have NR_OPEN bigger than NR_FILE, but you can change ··· 598 597 spinlock_t f_ep_lock; 599 598 #endif /* #ifdef CONFIG_EPOLL */ 600 599 struct address_space *f_mapping; 600 + struct rcu_head f_rcuhead; 601 601 }; 602 602 extern spinlock_t files_lock; 603 603 #define file_list_lock() spin_lock(&files_lock); 604 604 #define file_list_unlock() spin_unlock(&files_lock); 605 605 606 - #define get_file(x) atomic_inc(&(x)->f_count) 606 + #define get_file(x) rcuref_inc(&(x)->f_count) 607 607 #define file_count(x) atomic_read(&(x)->f_count) 608 608 609 609 #define MAX_NON_LFS ((1UL<<31) - 1)
+5
include/linux/init_task.h
··· 2 2 #define _LINUX__INIT_TASK_H 3 3 4 4 #include <linux/file.h> 5 + #include <linux/rcupdate.h> 5 6 6 7 #define INIT_FDTABLE \ 7 8 { \ ··· 12 11 .fd = &init_files.fd_array[0], \ 13 12 .close_on_exec = &init_files.close_on_exec_init, \ 14 13 .open_fds = &init_files.open_fds_init, \ 14 + .rcu = RCU_HEAD_INIT, \ 15 + .free_files = NULL, \ 16 + .next = NULL, \ 15 17 } 16 18 17 19 #define INIT_FILES \ 18 20 { \ 19 21 .count = ATOMIC_INIT(1), \ 20 22 .file_lock = SPIN_LOCK_UNLOCKED, \ 23 + .fdt = &init_files.fdtab, \ 21 24 .fdtab = INIT_FDTABLE, \ 22 25 .close_on_exec_init = { { 0, } }, \ 23 26 .open_fds_init = { { 0, } }, \
+8 -7
kernel/exit.c
··· 411 411 close_files(files); 412 412 /* 413 413 * Free the fd and fdset arrays if we expanded them. 414 + * If the fdtable was embedded, pass files for freeing 415 + * at the end of the RCU grace period. Otherwise, 416 + * you can free files immediately. 414 417 */ 415 418 fdt = files_fdtable(files); 416 - if (fdt->fd != &files->fd_array[0]) 417 - free_fd_array(fdt->fd, fdt->max_fds); 418 - if (fdt->max_fdset > __FD_SETSIZE) { 419 - free_fdset(fdt->open_fds, fdt->max_fdset); 420 - free_fdset(fdt->close_on_exec, fdt->max_fdset); 421 - } 422 - kmem_cache_free(files_cachep, files); 419 + if (fdt == &files->fdtab) 420 + fdt->free_files = files; 421 + else 422 + kmem_cache_free(files_cachep, files); 423 + free_fdtable(fdt); 423 424 } 424 425 } 425 426
+17 -6
kernel/fork.c
··· 35 35 #include <linux/syscalls.h> 36 36 #include <linux/jiffies.h> 37 37 #include <linux/futex.h> 38 + #include <linux/rcupdate.h> 38 39 #include <linux/ptrace.h> 39 40 #include <linux/mount.h> 40 41 #include <linux/audit.h> ··· 566 565 return 0; 567 566 } 568 567 569 - static int count_open_files(struct files_struct *files, int size) 568 + static int count_open_files(struct fdtable *fdt) 570 569 { 570 + int size = fdt->max_fdset; 571 571 int i; 572 - struct fdtable *fdt; 573 572 574 573 /* Find the last open fd */ 575 - fdt = files_fdtable(files); 576 574 for (i = size/(8*sizeof(long)); i > 0; ) { 577 575 if (fdt->open_fds->fds_bits[--i]) 578 576 break; ··· 592 592 atomic_set(&newf->count, 1); 593 593 594 594 spin_lock_init(&newf->file_lock); 595 - fdt = files_fdtable(newf); 595 + fdt = &newf->fdtab; 596 596 fdt->next_fd = 0; 597 597 fdt->max_fds = NR_OPEN_DEFAULT; 598 598 fdt->max_fdset = __FD_SETSIZE; 599 599 fdt->close_on_exec = &newf->close_on_exec_init; 600 600 fdt->open_fds = &newf->open_fds_init; 601 601 fdt->fd = &newf->fd_array[0]; 602 + INIT_RCU_HEAD(&fdt->rcu); 603 + fdt->free_files = NULL; 604 + fdt->next = NULL; 605 + rcu_assign_pointer(newf->fdt, fdt); 602 606 out: 603 607 return newf; 604 608 } ··· 641 637 old_fdt = files_fdtable(oldf); 642 638 new_fdt = files_fdtable(newf); 643 639 size = old_fdt->max_fdset; 644 - open_files = count_open_files(oldf, old_fdt->max_fdset); 640 + open_files = count_open_files(old_fdt); 645 641 expand = 0; 646 642 647 643 /* ··· 665 661 spin_unlock(&newf->file_lock); 666 662 if (error < 0) 667 663 goto out_release; 664 + new_fdt = files_fdtable(newf); 665 + /* 666 + * Reacquire the oldf lock and a pointer to its fd table 667 + * who knows it may have a new bigger fd table. We need 668 + * the latest pointer. 
669 + */ 668 670 spin_lock(&oldf->file_lock); 671 + old_fdt = files_fdtable(oldf); 669 672 } 670 673 671 674 old_fds = old_fdt->fd; ··· 694 683 */ 695 684 FD_CLR(open_files - i, new_fdt->open_fds); 696 685 } 697 - *new_fds++ = f; 686 + rcu_assign_pointer(*new_fds++, f); 698 687 } 699 688 spin_unlock(&oldf->file_lock); 700 689