epoll: introduce resource usage limits

It has been thought that the per-user file descriptor limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large amount
of kernel memory while staying well within its maximum number of fds. To
solve this problem, default limits are now imposed, and a /proc based
configuration has been introduced. A new directory, /proc/sys/fs/epoll/,
has been created, and inside it there are two configuration
points:

max_user_instances = Maximum number of devices - per user

max_user_watches = Maximum number of "watched" fds - per user
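
Both knobs can be read and changed at runtime through the usual /proc
interface. The paths are the ones introduced by this patch; the numeric
values shown are only illustrative:

  # cat /proc/sys/fs/epoll/max_user_instances
  128
  # echo 256 > /proc/sys/fs/epoll/max_user_instances
  # echo 200000 > /proc/sys/fs/epoll/max_user_watches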

The current default for "max_user_watches" limits the memory used by epoll
to store "watches" to 1/32 of the amount of low RAM. As an example, a
256MB 32-bit machine will have "max_user_watches" set to roughly 90000.
That should be enough not to break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, which should be
enough too.
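
To make the arithmetic behind that default concrete (using the roughly
90 bytes per-watch cost on a 32bit kernel, documented in the proc.txt
hunk below):

  256MB low RAM / 32         = 8MB reserved for epoll watches
  8MB / ~90 bytes per watch ~= 93000 watches

which is where the "roughly 90000" figure comes from.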

This also changes what userspace sees, because a new error code (-ENOSPC)
can now come out of EPOLL_CTL_ADD. The EMFILE from epoll_create() was
already listed, so that should be ok.
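
The attached reproducer is not duplicated here. Purely as a hypothetical
sketch of the same idea (the constants are illustrative, not part of this
patch): every (epoll instance, fd) pair is a distinct watch, so with N
available file descriptors a process can create about (N/2)^2 watches,
pinning kernel memory quadratically while fd usage stays linear:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

#define NEPOLL	500	/* epoll instances (illustrative) */
#define NDUP	500	/* duplicated pipe fds (illustrative) */

int main(void)
{
	int ep[NEPOLL], fds[NDUP], pipefd[2], i, j;
	struct epoll_event ev;
	long watches = 0;

	if (pipe(pipefd) < 0) {
		perror("pipe");
		return 1;
	}
	for (i = 0; i < NEPOLL; i++)
		if ((ep[i] = epoll_create(1)) < 0) {
			/* On a patched kernel this fails with EMFILE once
			 * max_user_instances (default 128) is exceeded. */
			perror("epoll_create");
			return 1;
		}
	for (j = 0; j < NDUP; j++)
		if ((fds[j] = dup(pipefd[0])) < 0) {
			perror("dup");
			return 1;
		}
	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN;
	/* Each add of a dup'ed fd to each instance allocates a fresh
	 * epitem/eppoll_entry pair in the kernel. */
	for (i = 0; i < NEPOLL; i++)
		for (j = 0; j < NDUP; j++) {
			ev.data.fd = fds[j];
			if (epoll_ctl(ep[i], EPOLL_CTL_ADD, fds[j], &ev) < 0) {
				/* With this patch, max_user_watches makes
				 * this fail here with ENOSPC. */
				fprintf(stderr, "epoll_ctl: %s\n",
					strerror(errno));
				goto out;
			}
			watches++;
		}
out:
	printf("created %ld watches\n", watches);
	return 0;
}

At roughly 90 bytes per watch on 32bit, the 250000 watches above pin
around 22MB of kernel memory in a single unpatched process, and the
program can be run (or forked) multiple times.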

[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>


 Documentation/filesystems/proc.txt |   27 +++++++
 fs/eventpoll.c                     |   85 ++++++++++++++-----
 include/linux/sched.h              |    4 +
 kernel/sysctl.c                    |   10 ++
 4 files changed, 118 insertions(+), 8 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -44,6 +44,7 @@
  2.14	/proc/<pid>/io - Display the IO accounting fields
  2.15	/proc/<pid>/coredump_filter - Core dump filtering settings
  2.16	/proc/<pid>/mountinfo - Information about mounts
+ 2.17	/proc/sys/fs/epoll - Configuration options for the epoll interface
 
 ------------------------------------------------------------------------------
 Preface
@@ -2483,4 +2484,30 @@
 
 Documentation/filesystems/sharedsubtree.txt
 
+2.17	/proc/sys/fs/epoll - Configuration options for the epoll interface
+--------------------------------------------------------
+
+This directory contains configuration options for the epoll(7) interface.
+
+max_user_instances
+------------------
+
+This is the maximum number of epoll file descriptors that a single user can
+have open at a given time. The default value is 128, and should be enough
+for normal users.
+
+max_user_watches
+----------------
+
+Every epoll file descriptor can store a number of files to be monitored
+for event readiness. Each one of these monitored files constitutes a "watch".
+This configuration option sets the maximum number of "watches" that are
+allowed for each user.
+Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
+on a 64bit one.
+The current default value for max_user_watches is the 1/32 of the available
+low memory, divided for the "watch" cost in bytes.
+
+
 ------------------------------------------------------------------------------
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -102,6 +102,8 @@
 
 #define EP_UNACTIVE_PTR ((void *) -1L)
 
+#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
+
 struct epoll_filefd {
 	struct file *file;
 	int fd;
@@ -200,6 +202,9 @@
 	 * holding ->lock.
 	 */
 	struct epitem *ovflist;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
 };
 
 /* Wait structure used by the poll hooks */
@@ -227,9 +232,17 @@
 };
 
 /*
+ * Configuration options available inside /proc/sys/fs/epoll/
+ */
+/* Maximum number of epoll devices, per user */
+static int max_user_instances __read_mostly;
+/* Maximum number of epoll watched descriptors, per user */
+static int max_user_watches __read_mostly;
+
+/*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
-static struct mutex epmutex;
+static DEFINE_MUTEX(epmutex);
 
 /* Safe wake up implementation */
 static struct poll_safewake psw;
@@ -239,6 +252,33 @@
 
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table epoll_table[] = {
+	{
+		.procname	= "max_user_instances",
+		.data		= &max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "max_user_watches",
+		.data		= &max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
 
 
 /* Setup the structure that is used as key for the RB tree */
@@ -402,6 +442,8 @@
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
 
+	atomic_dec(&ep->user->epoll_watches);
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
 		     current, ep, file));
 
@@ -449,5 +491,7 @@
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
+	atomic_dec(&ep->user->epoll_devs);
+	free_uid(ep->user);
 	kfree(ep);
 }
@@ -532,10 +576,19 @@
 
 static int ep_alloc(struct eventpoll **pep)
 {
-	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	int error;
+	struct user_struct *user;
+	struct eventpoll *ep;
 
-	if (!ep)
-		return -ENOMEM;
+	user = get_current_user();
+	error = -EMFILE;
+	if (unlikely(atomic_read(&user->epoll_devs) >=
+			max_user_instances))
+		goto free_uid;
+	error = -ENOMEM;
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (unlikely(!ep))
+		goto free_uid;
 
 	spin_lock_init(&ep->lock);
 	mutex_init(&ep->mtx);
@@ -544,10 +597,15 @@
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
 	ep->ovflist = EP_UNACTIVE_PTR;
+	ep->user = user;
 
 	*pep = ep;
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
 		     current, ep));
 	return 0;
+
+free_uid:
+	free_uid(user);
+	return error;
 }
@@ -703,9 +761,11 @@
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
-	error = -ENOMEM;
+	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
+		     max_user_watches))
+		return -ENOSPC;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-		goto error_return;
+		return -ENOMEM;
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -735,5 +795,6 @@
 	 * install process. Namely an allocation for a wait queue failed due
 	 * high memory pressure.
 	 */
+	error = -ENOMEM;
 	if (epi->nwait < 0)
 		goto error_unregister;
@@ -765,6 +826,8 @@
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
+	atomic_inc(&ep->user->epoll_watches);
+
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
@@ -789,6 +852,6 @@
 	spin_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
-error_return:
+
 	return error;
 }
@@ -1078,6 +1141,7 @@
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
+	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1299,7 +1363,12 @@
 
 static int __init eventpoll_init(void)
 {
-	mutex_init(&epmutex);
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	max_user_instances = 128;
+	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_poll_safewake_init(&psw);
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,6 +630,10 @@
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
 	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
 #endif
+#ifdef CONFIG_EPOLL
+	atomic_t epoll_devs;	/* The number of epoll descriptors currently open */
+	atomic_t epoll_watches; /* The number of file descriptors currently watched */
+#endif
 #ifdef CONFIG_POSIX_MQUEUE
 	/* protected by mq_lock	*/
 	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@
 #ifdef CONFIG_INOTIFY_USER
 extern struct ctl_table inotify_table[];
 #endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
@@ -1325,6 +1328,13 @@
 		.child		= inotify_table,
 	},
 #endif
+#ifdef CONFIG_EPOLL
+	{
+		.procname	= "epoll",
+		.mode		= 0555,
+		.child		= epoll_table,
+	},
+#endif
 #endif
 	{
 		.ctl_name = KERN_SETUID_DUMPABLE,