Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: allow for more than 2^31 files

Andrew,

Could you please review this patch? You are probably the right guy to
take it, because it crosses the fs and net trees.

Note: /proc/sys/fs/file-nr is a read-only file, so this patch doesn't
depend on the previous patch (sysctl: fix min/max handling in
__do_proc_doulongvec_minmax()).

Thanks !

[PATCH V4] fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32-bit value:

<quote>

We were seeing a failure which prevented boot. The kernel was incapable
of creating either a named pipe or unix domain socket. This comes down
to a common kernel function called unix_create1() which does:

	atomic_inc(&unix_nr_socks);
	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000). That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

The fix is to let /proc/sys/fs/file-nr and /proc/sys/fs/file-max use long
integers, and to change af_unix to use an atomic_long_t instead of an
atomic_t.

get_max_files() is changed to return an unsigned long.
get_nr_files() is changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, although this is
not strictly needed to address Robin's problem.

Before patch (on a 64-bit kernel):
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704 0 2147483648

Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Eric Dumazet and committed by
Al Viro
7e360c38 fde214d4

+21 -24
+7 -10
fs/file_table.c
··· 60 60 /* 61 61 * Return the total number of open files in the system 62 62 */ 63 - static int get_nr_files(void) 63 + static long get_nr_files(void) 64 64 { 65 65 return percpu_counter_read_positive(&nr_files); 66 66 } ··· 68 68 /* 69 69 * Return the maximum number of open files in the system 70 70 */ 71 - int get_max_files(void) 71 + unsigned long get_max_files(void) 72 72 { 73 73 return files_stat.max_files; 74 74 } ··· 82 82 void __user *buffer, size_t *lenp, loff_t *ppos) 83 83 { 84 84 files_stat.nr_files = get_nr_files(); 85 - return proc_dointvec(table, write, buffer, lenp, ppos); 85 + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 86 86 } 87 87 #else 88 88 int proc_nr_files(ctl_table *table, int write, ··· 105 105 struct file *get_empty_filp(void) 106 106 { 107 107 const struct cred *cred = current_cred(); 108 - static int old_max; 108 + static long old_max; 109 109 struct file * f; 110 110 111 111 /* ··· 140 140 over: 141 141 /* Ran out of filps - report that */ 142 142 if (get_nr_files() > old_max) { 143 - printk(KERN_INFO "VFS: file-max limit %d reached\n", 144 - get_max_files()); 143 + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); 145 144 old_max = get_nr_files(); 146 145 } 147 146 goto fail; ··· 486 487 487 488 void __init files_init(unsigned long mempages) 488 489 { 489 - int n; 490 + unsigned long n; 490 491 491 492 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 492 493 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ··· 497 498 */ 498 499 499 500 n = (mempages * (PAGE_SIZE / 1024)) / 10; 500 - files_stat.max_files = n; 501 - if (files_stat.max_files < NR_FILE) 502 - files_stat.max_files = NR_FILE; 501 + files_stat.max_files = max_t(unsigned long, n, NR_FILE); 503 502 files_defer_init(); 504 503 lg_lock_init(files_lglock); 505 504 percpu_counter_init(&nr_files, 0);
+4 -4
include/linux/fs.h
··· 34 34 35 35 /* And dynamically-tunable limits and defaults: */ 36 36 struct files_stat_struct { 37 - int nr_files; /* read only */ 38 - int nr_free_files; /* read only */ 39 - int max_files; /* tunable */ 37 + unsigned long nr_files; /* read only */ 38 + unsigned long nr_free_files; /* read only */ 39 + unsigned long max_files; /* tunable */ 40 40 }; 41 41 42 42 struct inodes_stat_t { ··· 400 400 extern void __init files_init(unsigned long); 401 401 402 402 extern struct files_stat_struct files_stat; 403 - extern int get_max_files(void); 403 + extern unsigned long get_max_files(void); 404 404 extern int sysctl_nr_open; 405 405 extern struct inodes_stat_t inodes_stat; 406 406 extern int leases_enable, lease_break_time;
+3 -3
kernel/sysctl.c
··· 1352 1352 { 1353 1353 .procname = "file-nr", 1354 1354 .data = &files_stat, 1355 - .maxlen = 3*sizeof(int), 1355 + .maxlen = sizeof(files_stat), 1356 1356 .mode = 0444, 1357 1357 .proc_handler = proc_nr_files, 1358 1358 }, 1359 1359 { 1360 1360 .procname = "file-max", 1361 1361 .data = &files_stat.max_files, 1362 - .maxlen = sizeof(int), 1362 + .maxlen = sizeof(files_stat.max_files), 1363 1363 .mode = 0644, 1364 - .proc_handler = proc_dointvec, 1364 + .proc_handler = proc_doulongvec_minmax, 1365 1365 }, 1366 1366 { 1367 1367 .procname = "nr_open",
+7 -7
net/unix/af_unix.c
··· 117 117 118 118 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; 119 119 static DEFINE_SPINLOCK(unix_table_lock); 120 - static atomic_t unix_nr_socks = ATOMIC_INIT(0); 120 + static atomic_long_t unix_nr_socks; 121 121 122 122 #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) 123 123 ··· 360 360 if (u->addr) 361 361 unix_release_addr(u->addr); 362 362 363 - atomic_dec(&unix_nr_socks); 363 + atomic_long_dec(&unix_nr_socks); 364 364 local_bh_disable(); 365 365 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 366 366 local_bh_enable(); 367 367 #ifdef UNIX_REFCNT_DEBUG 368 - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, 369 - atomic_read(&unix_nr_socks)); 368 + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, 369 + atomic_long_read(&unix_nr_socks)); 370 370 #endif 371 371 } 372 372 ··· 606 606 struct sock *sk = NULL; 607 607 struct unix_sock *u; 608 608 609 - atomic_inc(&unix_nr_socks); 610 - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) 609 + atomic_long_inc(&unix_nr_socks); 610 + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) 611 611 goto out; 612 612 613 613 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); ··· 632 632 unix_insert_socket(unix_sockets_unbound, sk); 633 633 out: 634 634 if (sk == NULL) 635 - atomic_dec(&unix_nr_socks); 635 + atomic_long_dec(&unix_nr_socks); 636 636 else { 637 637 local_bh_disable(); 638 638 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);