epoll: introduce resource usage limits

It was thought that the per-user file descriptor limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large
amount of kernel memory, well within its maximum number of fds. To
solve this problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/, and inside it there are two configuration
points:

max_user_instances = Maximum number of epoll devices (open epoll file descriptors) - per user

max_user_watches = Maximum number of "watched" fds - per user

The current default for "max_user_watches" limits the memory used by epoll
to store "watches" to 1/32 of the amount of low RAM. As an example, a
256MB 32-bit machine will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, which should be
enough too.

This also changes userspace-visible behavior, because a new error code
(-ENOSPC) can now be returned by EPOLL_CTL_ADD. The EMFILE from
epoll_create() was already listed, so that should be ok.

[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Davide Libenzi and committed by Linus Torvalds 7ef9964e b7d271df

+118 -8
+27
Documentation/filesystems/proc.txt
··· 44 2.14 /proc/<pid>/io - Display the IO accounting fields 45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings 46 2.16 /proc/<pid>/mountinfo - Information about mounts 47 48 ------------------------------------------------------------------------------ 49 Preface ··· 2484 2485 Documentation/filesystems/sharedsubtree.txt 2486 2487 ------------------------------------------------------------------------------
··· 44 2.14 /proc/<pid>/io - Display the IO accounting fields 45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings 46 2.16 /proc/<pid>/mountinfo - Information about mounts 47 + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface 48 49 ------------------------------------------------------------------------------ 50 Preface ··· 2483 2484 Documentation/filesystems/sharedsubtree.txt 2485 2486 + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface 2487 + -------------------------------------------------------- 2488 + 2489 + This directory contains configuration options for the epoll(7) interface. 2490 + 2491 + max_user_instances 2492 + ------------------ 2493 + 2494 + This is the maximum number of epoll file descriptors that a single user can 2495 + have open at a given time. The default value is 128, and should be enough 2496 + for normal users. 2497 + 2498 + max_user_watches 2499 + ---------------- 2500 + 2501 + Every epoll file descriptor can store a number of files to be monitored 2502 + for event readiness. Each one of these monitored files constitutes a "watch". 2503 + This configuration option sets the maximum number of "watches" that are 2504 + allowed for each user. 2505 + Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes 2506 + on a 64bit one. 2507 + The current default value for max_user_watches is the 1/32 of the available 2508 + low memory, divided for the "watch" cost in bytes. 2509 + 2510 + 2511 ------------------------------------------------------------------------------ 2512 +
+77 -8
fs/eventpoll.c
··· 102 103 #define EP_UNACTIVE_PTR ((void *) -1L) 104 105 struct epoll_filefd { 106 struct file *file; 107 int fd; ··· 202 * holding ->lock. 203 */ 204 struct epitem *ovflist; 205 }; 206 207 /* Wait structure used by the poll hooks */ ··· 232 }; 233 234 /* 235 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 236 */ 237 - static struct mutex epmutex; 238 239 /* Safe wake up implementation */ 240 static struct poll_safewake psw; ··· 252 253 /* Slab cache used to allocate "struct eppoll_entry" */ 254 static struct kmem_cache *pwq_cache __read_mostly; 255 256 257 /* Setup the structure that is used as key for the RB tree */ ··· 442 /* At this point it is safe to free the eventpoll item */ 443 kmem_cache_free(epi_cache, epi); 444 445 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", 446 current, ep, file)); 447 ··· 491 492 mutex_unlock(&epmutex); 493 mutex_destroy(&ep->mtx); 494 kfree(ep); 495 } 496 ··· 576 577 static int ep_alloc(struct eventpoll **pep) 578 { 579 - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); 580 581 - if (!ep) 582 - return -ENOMEM; 583 584 spin_lock_init(&ep->lock); 585 mutex_init(&ep->mtx); ··· 597 INIT_LIST_HEAD(&ep->rdllist); 598 ep->rbr = RB_ROOT; 599 ep->ovflist = EP_UNACTIVE_PTR; 600 601 *pep = ep; 602 603 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 604 current, ep)); 605 return 0; 606 } 607 608 /* ··· 761 struct epitem *epi; 762 struct ep_pqueue epq; 763 764 - error = -ENOMEM; 765 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 766 - goto error_return; 767 768 /* Item initialization follow here ... */ 769 INIT_LIST_HEAD(&epi->rdllink); ··· 795 * install process. Namely an allocation for a wait queue failed due 796 * high memory pressure. 
797 */ 798 if (epi->nwait < 0) 799 goto error_unregister; 800 ··· 826 827 spin_unlock_irqrestore(&ep->lock, flags); 828 829 /* We have to call this outside the lock */ 830 if (pwake) 831 ep_poll_safewake(&psw, &ep->poll_wait); ··· 852 spin_unlock_irqrestore(&ep->lock, flags); 853 854 kmem_cache_free(epi_cache, epi); 855 - error_return: 856 return error; 857 } 858 ··· 1141 flags & O_CLOEXEC); 1142 if (fd < 0) 1143 ep_free(ep); 1144 1145 error_return: 1146 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", ··· 1363 1364 static int __init eventpoll_init(void) 1365 { 1366 - mutex_init(&epmutex); 1367 1368 /* Initialize the structure used to perform safe poll wait head wake ups */ 1369 ep_poll_safewake_init(&psw);
··· 102 103 #define EP_UNACTIVE_PTR ((void *) -1L) 104 105 + #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) 106 + 107 struct epoll_filefd { 108 struct file *file; 109 int fd; ··· 200 * holding ->lock. 201 */ 202 struct epitem *ovflist; 203 + 204 + /* The user that created the eventpoll descriptor */ 205 + struct user_struct *user; 206 }; 207 208 /* Wait structure used by the poll hooks */ ··· 227 }; 228 229 /* 230 + * Configuration options available inside /proc/sys/fs/epoll/ 231 + */ 232 + /* Maximum number of epoll devices, per user */ 233 + static int max_user_instances __read_mostly; 234 + /* Maximum number of epoll watched descriptors, per user */ 235 + static int max_user_watches __read_mostly; 236 + 237 + /* 238 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 239 */ 240 + static DEFINE_MUTEX(epmutex); 241 242 /* Safe wake up implementation */ 243 static struct poll_safewake psw; ··· 239 240 /* Slab cache used to allocate "struct eppoll_entry" */ 241 static struct kmem_cache *pwq_cache __read_mostly; 242 + 243 + #ifdef CONFIG_SYSCTL 244 + 245 + #include <linux/sysctl.h> 246 + 247 + static int zero; 248 + 249 + ctl_table epoll_table[] = { 250 + { 251 + .procname = "max_user_instances", 252 + .data = &max_user_instances, 253 + .maxlen = sizeof(int), 254 + .mode = 0644, 255 + .proc_handler = &proc_dointvec_minmax, 256 + .extra1 = &zero, 257 + }, 258 + { 259 + .procname = "max_user_watches", 260 + .data = &max_user_watches, 261 + .maxlen = sizeof(int), 262 + .mode = 0644, 263 + .proc_handler = &proc_dointvec_minmax, 264 + .extra1 = &zero, 265 + }, 266 + { .ctl_name = 0 } 267 + }; 268 + #endif /* CONFIG_SYSCTL */ 269 270 271 /* Setup the structure that is used as key for the RB tree */ ··· 402 /* At this point it is safe to free the eventpoll item */ 403 kmem_cache_free(epi_cache, epi); 404 405 + atomic_dec(&ep->user->epoll_watches); 406 + 407 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", 408 
current, ep, file)); 409 ··· 449 450 mutex_unlock(&epmutex); 451 mutex_destroy(&ep->mtx); 452 + atomic_dec(&ep->user->epoll_devs); 453 + free_uid(ep->user); 454 kfree(ep); 455 } 456 ··· 532 533 static int ep_alloc(struct eventpoll **pep) 534 { 535 + int error; 536 + struct user_struct *user; 537 + struct eventpoll *ep; 538 539 + user = get_current_user(); 540 + error = -EMFILE; 541 + if (unlikely(atomic_read(&user->epoll_devs) >= 542 + max_user_instances)) 543 + goto free_uid; 544 + error = -ENOMEM; 545 + ep = kzalloc(sizeof(*ep), GFP_KERNEL); 546 + if (unlikely(!ep)) 547 + goto free_uid; 548 549 spin_lock_init(&ep->lock); 550 mutex_init(&ep->mtx); ··· 544 INIT_LIST_HEAD(&ep->rdllist); 545 ep->rbr = RB_ROOT; 546 ep->ovflist = EP_UNACTIVE_PTR; 547 + ep->user = user; 548 549 *pep = ep; 550 551 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 552 current, ep)); 553 return 0; 554 + 555 + free_uid: 556 + free_uid(user); 557 + return error; 558 } 559 560 /* ··· 703 struct epitem *epi; 704 struct ep_pqueue epq; 705 706 + if (unlikely(atomic_read(&ep->user->epoll_watches) >= 707 + max_user_watches)) 708 + return -ENOSPC; 709 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 710 + return -ENOMEM; 711 712 /* Item initialization follow here ... */ 713 INIT_LIST_HEAD(&epi->rdllink); ··· 735 * install process. Namely an allocation for a wait queue failed due 736 * high memory pressure. 
737 */ 738 + error = -ENOMEM; 739 if (epi->nwait < 0) 740 goto error_unregister; 741 ··· 765 766 spin_unlock_irqrestore(&ep->lock, flags); 767 768 + atomic_inc(&ep->user->epoll_watches); 769 + 770 /* We have to call this outside the lock */ 771 if (pwake) 772 ep_poll_safewake(&psw, &ep->poll_wait); ··· 789 spin_unlock_irqrestore(&ep->lock, flags); 790 791 kmem_cache_free(epi_cache, epi); 792 + 793 return error; 794 } 795 ··· 1078 flags & O_CLOEXEC); 1079 if (fd < 0) 1080 ep_free(ep); 1081 + atomic_inc(&ep->user->epoll_devs); 1082 1083 error_return: 1084 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", ··· 1299 1300 static int __init eventpoll_init(void) 1301 { 1302 + struct sysinfo si; 1303 + 1304 + si_meminfo(&si); 1305 + max_user_instances = 128; 1306 + max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / 1307 + EP_ITEM_COST; 1308 1309 /* Initialize the structure used to perform safe poll wait head wake ups */ 1310 ep_poll_safewake_init(&psw);
+4
include/linux/sched.h
··· 630 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 631 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 632 #endif 633 #ifdef CONFIG_POSIX_MQUEUE 634 /* protected by mq_lock */ 635 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
··· 630 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 631 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 632 #endif 633 + #ifdef CONFIG_EPOLL 634 + atomic_t epoll_devs; /* The number of epoll descriptors currently open */ 635 + atomic_t epoll_watches; /* The number of file descriptors currently watched */ 636 + #endif 637 #ifdef CONFIG_POSIX_MQUEUE 638 /* protected by mq_lock */ 639 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
+10
kernel/sysctl.c
··· 176 #ifdef CONFIG_INOTIFY_USER 177 extern struct ctl_table inotify_table[]; 178 #endif 179 180 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 181 int sysctl_legacy_va_layout; ··· 1328 .child = inotify_table, 1329 }, 1330 #endif 1331 #endif 1332 { 1333 .ctl_name = KERN_SETUID_DUMPABLE,
··· 176 #ifdef CONFIG_INOTIFY_USER 177 extern struct ctl_table inotify_table[]; 178 #endif 179 + #ifdef CONFIG_EPOLL 180 + extern struct ctl_table epoll_table[]; 181 + #endif 182 183 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 184 int sysctl_legacy_va_layout; ··· 1325 .child = inotify_table, 1326 }, 1327 #endif 1328 + #ifdef CONFIG_EPOLL 1329 + { 1330 + .procname = "epoll", 1331 + .mode = 0555, 1332 + .child = epoll_table, 1333 + }, 1334 + #endif 1335 #endif 1336 { 1337 .ctl_name = KERN_SETUID_DUMPABLE,