Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

get rid of NR_OPEN and introduce a sysctl_nr_open

NR_OPEN (historically set to 1024*1024) actually prevents processes from
opening more than 1024*1024 handles.

Unfortunately, some production servers hit the not so 'ridiculously high
value' of 1024*1024 file descriptors per process.

Changing NR_OPEN is not considered safe because of potential vmalloc space
exhaustion.

This patch introduces a new sysctl (/proc/sys/fs/nr_open) which defaults to
1024*1024, so that admins can decide to change this limit if their workload
needs it.

[akpm@linux-foundation.org: export it for sparc64]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Eric Dumazet and committed by
Linus Torvalds
9cfe015a 774ed22c

+41 -10
+8
Documentation/filesystems/proc.txt
··· 1029 1029 Denotes the number of inodes the system has allocated. This number will 1030 1030 grow and shrink dynamically. 1031 1031 1032 + nr_open 1033 + ------- 1034 + 1035 + Denotes the maximum number of file-handles a process can 1036 + allocate. Default value is 1024*1024 (1048576) which should be 1037 + enough for most machines. Actual limit depends on RLIMIT_NOFILE 1038 + resource limit. 1039 + 1032 1040 nr_free_inodes 1033 1041 -------------- 1034 1042
+10
Documentation/sysctl/fs.txt
··· 23 23 - inode-max 24 24 - inode-nr 25 25 - inode-state 26 + - nr_open 26 27 - overflowuid 27 28 - overflowgid 28 29 - suid_dumpable ··· 89 88 close to the maximum, but the number of unused file handles is 90 89 significantly greater than 0, you've encountered a peak in your 91 90 usage of file handles and you don't need to increase the maximum. 91 + 92 + ============================================================== 93 + 94 + nr_open: 95 + 96 + This denotes the maximum number of file-handles a process can 97 + allocate. Default value is 1024*1024 (1048576) which should be 98 + enough for most machines. Actual limit depends on RLIMIT_NOFILE 99 + resource limit. 92 100 93 101 ============================================================== 94 102
+1 -1
arch/alpha/kernel/osf_sys.c
··· 430 430 asmlinkage unsigned long 431 431 sys_getdtablesize(void) 432 432 { 433 - return NR_OPEN; 433 + return sysctl_nr_open; 434 434 } 435 435 436 436 /*
+1 -1
arch/mips/kernel/sysirix.c
··· 356 356 retval = NGROUPS_MAX; 357 357 goto out; 358 358 case 5: 359 - retval = NR_OPEN; 359 + retval = sysctl_nr_open; 360 360 goto out; 361 361 case 6: 362 362 retval = 1;
+1
arch/sparc64/kernel/sparc64_ksyms.c
··· 277 277 EXPORT_SYMBOL(sys_geteuid); 278 278 EXPORT_SYMBOL(sys_getuid); 279 279 EXPORT_SYMBOL(sys_getegid); 280 + EXPORT_SYMBOL(sysctl_nr_open); 280 281 EXPORT_SYMBOL(sys_getgid); 281 282 EXPORT_SYMBOL(svr4_getcontext); 282 283 EXPORT_SYMBOL(svr4_setcontext);
+1 -1
arch/sparc64/solaris/fs.c
··· 624 624 case 3: /* UL_GMEMLIM */ 625 625 return current->signal->rlim[RLIMIT_DATA].rlim_cur; 626 626 case 4: /* UL_GDESLIM */ 627 - return NR_OPEN; 627 + return sysctl_nr_open; 628 628 } 629 629 return -EINVAL; 630 630 }
+4 -2
arch/sparc64/solaris/timod.c
··· 859 859 860 860 SOLD("entry"); 861 861 lock_kernel(); 862 - if(fd >= NR_OPEN) goto out; 862 + if (fd >= sysctl_nr_open) 863 + goto out; 863 864 864 865 fdt = files_fdtable(current->files); 865 866 filp = fdt->fd[fd]; ··· 928 927 929 928 SOLD("entry"); 930 929 lock_kernel(); 931 - if(fd >= NR_OPEN) goto out; 930 + if (fd >= sysctl_nr_open) 931 + goto out; 932 932 933 933 fdt = files_fdtable(current->files); 934 934 filp = fdt->fd[fd];
+5 -3
fs/file.c
··· 24 24 struct fdtable *next; 25 25 }; 26 26 27 + int sysctl_nr_open __read_mostly = 1024*1024; 28 + 27 29 /* 28 30 * We use this list to defer free fdtables that have vmalloced 29 31 * sets/arrays. By keeping a per-cpu list, we avoid having to embed ··· 149 147 nr /= (1024 / sizeof(struct file *)); 150 148 nr = roundup_pow_of_two(nr + 1); 151 149 nr *= (1024 / sizeof(struct file *)); 152 - if (nr > NR_OPEN) 153 - nr = NR_OPEN; 150 + if (nr > sysctl_nr_open) 151 + nr = sysctl_nr_open; 154 152 155 153 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); 156 154 if (!fdt) ··· 235 233 if (nr < fdt->max_fds) 236 234 return 0; 237 235 /* Can we expand? */ 238 - if (nr >= NR_OPEN) 236 + if (nr >= sysctl_nr_open) 239 237 return -EMFILE; 240 238 241 239 /* All good, so we try */
+1 -1
include/linux/fs.h
··· 21 21 22 22 /* Fixed constants first: */ 23 23 #undef NR_OPEN 24 - #define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ 24 + extern int sysctl_nr_open; 25 25 #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ 26 26 27 27 #define BLOCK_SIZE_BITS 10
+1 -1
kernel/sys.c
··· 1472 1472 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1473 1473 !capable(CAP_SYS_RESOURCE)) 1474 1474 return -EPERM; 1475 - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1475 + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1476 1476 return -EPERM; 1477 1477 1478 1478 retval = security_task_setrlimit(resource, &new_rlim);
+8
kernel/sysctl.c
··· 1203 1203 .proc_handler = &proc_dointvec, 1204 1204 }, 1205 1205 { 1206 + .ctl_name = CTL_UNNUMBERED, 1207 + .procname = "nr_open", 1208 + .data = &sysctl_nr_open, 1209 + .maxlen = sizeof(int), 1210 + .mode = 0644, 1211 + .proc_handler = &proc_dointvec, 1212 + }, 1213 + { 1206 1214 .ctl_name = FS_DENTRY, 1207 1215 .procname = "dentry-state", 1208 1216 .data = &dentry_stat,