Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

epoll cleanups: epoll remove static pre-declarations and akpm-ize the code

Re-arrange the epoll code to avoid static function pre-declarations, and apply
the akpm-filter to it.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Davide Libenzi and committed by
Linus Torvalds
7699acd1 cea69241

+521 -583
+521 -583
fs/eventpoll.c
··· 41 41 #include <asm/atomic.h> 42 42 #include <asm/semaphore.h> 43 43 44 - 45 44 /* 46 45 * LOCKING: 47 46 * There are three level of locking required by epoll : ··· 73 74 * a greater scalability. 74 75 */ 75 76 76 - 77 77 #define DEBUG_EPOLL 0 78 78 79 79 #if DEBUG_EPOLL > 0 ··· 101 103 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 102 104 103 105 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 104 - 105 106 106 107 struct epoll_filefd { 107 108 struct file *file; ··· 219 222 struct epitem *epi; 220 223 }; 221 224 222 - 223 - 224 - static void ep_poll_safewake_init(struct poll_safewake *psw); 225 - static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq); 226 - static int ep_alloc(struct eventpoll **pep); 227 - static void ep_free(struct eventpoll *ep); 228 - static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); 229 - static void ep_use_epitem(struct epitem *epi); 230 - static void ep_release_epitem(struct epitem *epi); 231 - static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, 232 - poll_table *pt); 233 - static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi); 234 - static int ep_insert(struct eventpoll *ep, struct epoll_event *event, 235 - struct file *tfile, int fd); 236 - static int ep_modify(struct eventpoll *ep, struct epitem *epi, 237 - struct epoll_event *event); 238 - static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi); 239 - static int ep_unlink(struct eventpoll *ep, struct epitem *epi); 240 - static int ep_remove(struct eventpoll *ep, struct epitem *epi); 241 - static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key); 242 - static int ep_eventpoll_close(struct inode *inode, struct file *file); 243 - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait); 244 - static int ep_send_events(struct eventpoll *ep, struct list_head *txlist, 245 
- struct epoll_event __user *events, int maxevents); 246 - static int ep_events_transfer(struct eventpoll *ep, 247 - struct epoll_event __user *events, 248 - int maxevents); 249 - static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 250 - int maxevents, long timeout); 251 - 252 225 /* 253 226 * This semaphore is used to serialize ep_free() and eventpoll_release_file(). 254 227 */ ··· 233 266 /* Slab cache used to allocate "struct eppoll_entry" */ 234 267 static struct kmem_cache *pwq_cache __read_mostly; 235 268 236 - /* File callbacks that implement the eventpoll file behaviour */ 237 - static const struct file_operations eventpoll_fops = { 238 - .release = ep_eventpoll_close, 239 - .poll = ep_eventpoll_poll 240 - }; 241 - 242 - 243 - 244 - /* Fast test to see if the file is an evenpoll file */ 245 - static inline int is_file_epoll(struct file *f) 246 - { 247 - return f->f_op == &eventpoll_fops; 248 - } 249 269 250 270 /* Setup the structure that is used as key for the rb-tree */ 251 271 static inline void ep_set_ffd(struct epoll_filefd *ffd, ··· 301 347 spin_lock_init(&psw->lock); 302 348 } 303 349 304 - 305 350 /* 306 351 * Perform a safe wake up of the poll wait list. The problem is that 307 352 * with the new callback'd wake up system, it is possible that the ··· 355 402 spin_unlock_irqrestore(&psw->lock, flags); 356 403 } 357 404 405 + /* 406 + * This function unregister poll callbacks from the associated file descriptor. 407 + * Since this must be called without holding "ep->lock" the atomic exchange trick 408 + * will protect us from multiple unregister. 
409 + */ 410 + static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 411 + { 412 + int nwait; 413 + struct list_head *lsthead = &epi->pwqlist; 414 + struct eppoll_entry *pwq; 415 + 416 + /* This is called without locks, so we need the atomic exchange */ 417 + nwait = xchg(&epi->nwait, 0); 418 + 419 + if (nwait) { 420 + while (!list_empty(lsthead)) { 421 + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 422 + 423 + list_del_init(&pwq->llink); 424 + remove_wait_queue(pwq->whead, &pwq->wait); 425 + kmem_cache_free(pwq_cache, pwq); 426 + } 427 + } 428 + } 358 429 359 430 /* 360 - * This is called from eventpoll_release() to unlink files from the eventpoll 361 - * interface. We need to have this facility to cleanup correctly files that are 362 - * closed without being removed from the eventpoll interface. 431 + * Unlink the "struct epitem" from all places it might have been hooked up. 432 + * This function must be called with write IRQ lock on "ep->lock". 363 433 */ 364 - void eventpoll_release_file(struct file *file) 434 + static int ep_unlink(struct eventpoll *ep, struct epitem *epi) 365 435 { 366 - struct list_head *lsthead = &file->f_ep_links; 367 - struct eventpoll *ep; 368 - struct epitem *epi; 436 + int error; 369 437 370 438 /* 371 - * We don't want to get "file->f_ep_lock" because it is not 372 - * necessary. It is not necessary because we're in the "struct file" 373 - * cleanup path, and this means that noone is using this file anymore. 374 - * The only hit might come from ep_free() but by holding the semaphore 375 - * will correctly serialize the operation. We do need to acquire 376 - * "ep->sem" after "epmutex" because ep_remove() requires it when called 377 - * from anywhere but ep_free(). 439 + * It can happen that this one is called for an item already unlinked. 440 + * The check protect us from doing a double unlink ( crash ). 
378 441 */ 379 - mutex_lock(&epmutex); 442 + error = -ENOENT; 443 + if (!ep_rb_linked(&epi->rbn)) 444 + goto error_return; 380 445 381 - while (!list_empty(lsthead)) { 382 - epi = list_first_entry(lsthead, struct epitem, fllink); 446 + /* 447 + * Clear the event mask for the unlinked item. This will avoid item 448 + * notifications to be sent after the unlink operation from inside 449 + * the kernel->userspace event transfer loop. 450 + */ 451 + epi->event.events = 0; 383 452 384 - ep = epi->ep; 453 + /* 454 + * At this point is safe to do the job, unlink the item from our rb-tree. 455 + * This operation togheter with the above check closes the door to 456 + * double unlinks. 457 + */ 458 + ep_rb_erase(&epi->rbn, &ep->rbr); 459 + 460 + /* 461 + * If the item we are going to remove is inside the ready file descriptors 462 + * we want to remove it from this list to avoid stale events. 463 + */ 464 + if (ep_is_linked(&epi->rdllink)) 465 + list_del_init(&epi->rdllink); 466 + 467 + error = 0; 468 + error_return: 469 + 470 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", 471 + current, ep, epi->ffd.file, error)); 472 + 473 + return error; 474 + } 475 + 476 + /* 477 + * Increment the usage count of the "struct epitem" making it sure 478 + * that the user will have a valid pointer to reference. 479 + */ 480 + static void ep_use_epitem(struct epitem *epi) 481 + { 482 + atomic_inc(&epi->usecnt); 483 + } 484 + 485 + /* 486 + * Decrement ( release ) the usage count by signaling that the user 487 + * has finished using the structure. It might lead to freeing the 488 + * structure itself if the count goes to zero. 489 + */ 490 + static void ep_release_epitem(struct epitem *epi) 491 + { 492 + if (atomic_dec_and_test(&epi->usecnt)) 493 + kmem_cache_free(epi_cache, epi); 494 + } 495 + 496 + /* 497 + * Removes a "struct epitem" from the eventpoll RB tree and deallocates 498 + * all the associated resources. 
499 + */ 500 + static int ep_remove(struct eventpoll *ep, struct epitem *epi) 501 + { 502 + int error; 503 + unsigned long flags; 504 + struct file *file = epi->ffd.file; 505 + 506 + /* 507 + * Removes poll wait queue hooks. We _have_ to do this without holding 508 + * the "ep->lock" otherwise a deadlock might occur. This because of the 509 + * sequence of the lock acquisition. Here we do "ep->lock" then the wait 510 + * queue head lock when unregistering the wait queue. The wakeup callback 511 + * will run by holding the wait queue head lock and will call our callback 512 + * that will try to get "ep->lock". 513 + */ 514 + ep_unregister_pollwait(ep, epi); 515 + 516 + /* Remove the current item from the list of epoll hooks */ 517 + spin_lock(&file->f_ep_lock); 518 + if (ep_is_linked(&epi->fllink)) 385 519 list_del_init(&epi->fllink); 386 - down_write(&ep->sem); 387 - ep_remove(ep, epi); 388 - up_write(&ep->sem); 389 - } 520 + spin_unlock(&file->f_ep_lock); 390 521 391 - mutex_unlock(&epmutex); 392 - } 522 + /* We need to acquire the write IRQ lock before calling ep_unlink() */ 523 + write_lock_irqsave(&ep->lock, flags); 393 524 525 + /* Really unlink the item from the RB tree */ 526 + error = ep_unlink(ep, epi); 394 527 395 - /* 396 - * It opens an eventpoll file descriptor by suggesting a storage of "size" 397 - * file descriptors. The size parameter is just an hint about how to size 398 - * data structures. It won't prevent the user to store more than "size" 399 - * file descriptors inside the epoll interface. It is the kernel part of 400 - * the userspace epoll_create(2). 
401 - */ 402 - asmlinkage long sys_epoll_create(int size) 403 - { 404 - int error, fd = -1; 405 - struct eventpoll *ep; 406 - struct inode *inode; 407 - struct file *file; 528 + write_unlock_irqrestore(&ep->lock, flags); 408 529 409 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", 410 - current, size)); 411 - 412 - /* 413 - * Sanity check on the size parameter, and create the internal data 414 - * structure ( "struct eventpoll" ). 415 - */ 416 - error = -EINVAL; 417 - if (size <= 0 || (error = ep_alloc(&ep)) != 0) 418 - goto eexit_1; 419 - 420 - /* 421 - * Creates all the items needed to setup an eventpoll file. That is, 422 - * a file structure, and inode and a free file descriptor. 423 - */ 424 - error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]", 425 - &eventpoll_fops, ep); 426 530 if (error) 427 - goto eexit_2; 531 + goto error_return; 428 532 429 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 430 - current, size, fd)); 533 + /* At this point it is safe to free the eventpoll item */ 534 + ep_release_epitem(epi); 431 535 432 - return fd; 433 - 434 - eexit_2: 435 - ep_free(ep); 436 - kfree(ep); 437 - eexit_1: 438 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 439 - current, size, error)); 440 - return error; 441 - } 442 - 443 - 444 - /* 445 - * The following function implements the controller interface for 446 - * the eventpoll file that enables the insertion/removal/change of 447 - * file descriptors inside the interest set. It represents 448 - * the kernel part of the user space epoll_ctl(2). 
449 - */ 450 - asmlinkage long 451 - sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) 452 - { 453 - int error; 454 - struct file *file, *tfile; 455 - struct eventpoll *ep; 456 - struct epitem *epi; 457 - struct epoll_event epds; 458 - 459 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", 460 - current, epfd, op, fd, event)); 461 - 462 - error = -EFAULT; 463 - if (ep_op_has_event(op) && 464 - copy_from_user(&epds, event, sizeof(struct epoll_event))) 465 - goto eexit_1; 466 - 467 - /* Get the "struct file *" for the eventpoll file */ 468 - error = -EBADF; 469 - file = fget(epfd); 470 - if (!file) 471 - goto eexit_1; 472 - 473 - /* Get the "struct file *" for the target file */ 474 - tfile = fget(fd); 475 - if (!tfile) 476 - goto eexit_2; 477 - 478 - /* The target file descriptor must support poll */ 479 - error = -EPERM; 480 - if (!tfile->f_op || !tfile->f_op->poll) 481 - goto eexit_3; 482 - 483 - /* 484 - * We have to check that the file structure underneath the file descriptor 485 - * the user passed to us _is_ an eventpoll file. And also we do not permit 486 - * adding an epoll file descriptor inside itself. 487 - */ 488 - error = -EINVAL; 489 - if (file == tfile || !is_file_epoll(file)) 490 - goto eexit_3; 491 - 492 - /* 493 - * At this point it is safe to assume that the "private_data" contains 494 - * our own data structure. 
495 - */ 496 - ep = file->private_data; 497 - 498 - down_write(&ep->sem); 499 - 500 - /* Try to lookup the file inside our RB tree */ 501 - epi = ep_find(ep, tfile, fd); 502 - 503 - error = -EINVAL; 504 - switch (op) { 505 - case EPOLL_CTL_ADD: 506 - if (!epi) { 507 - epds.events |= POLLERR | POLLHUP; 508 - 509 - error = ep_insert(ep, &epds, tfile, fd); 510 - } else 511 - error = -EEXIST; 512 - break; 513 - case EPOLL_CTL_DEL: 514 - if (epi) 515 - error = ep_remove(ep, epi); 516 - else 517 - error = -ENOENT; 518 - break; 519 - case EPOLL_CTL_MOD: 520 - if (epi) { 521 - epds.events |= POLLERR | POLLHUP; 522 - error = ep_modify(ep, epi, &epds); 523 - } else 524 - error = -ENOENT; 525 - break; 526 - } 527 - 528 - /* 529 - * The function ep_find() increments the usage count of the structure 530 - * so, if this is not NULL, we need to release it. 531 - */ 532 - if (epi) 533 - ep_release_epitem(epi); 534 - 535 - up_write(&ep->sem); 536 - 537 - eexit_3: 538 - fput(tfile); 539 - eexit_2: 540 - fput(file); 541 - eexit_1: 542 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", 543 - current, epfd, op, fd, event, error)); 536 + error = 0; 537 + error_return: 538 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n", 539 + current, ep, file, error)); 544 540 545 541 return error; 546 542 } 547 - 548 - 549 - /* 550 - * Implement the event wait interface for the eventpoll file. It is the kernel 551 - * part of the user space epoll_wait(2). 
552 - */ 553 - asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 554 - int maxevents, int timeout) 555 - { 556 - int error; 557 - struct file *file; 558 - struct eventpoll *ep; 559 - 560 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", 561 - current, epfd, events, maxevents, timeout)); 562 - 563 - /* The maximum number of event must be greater than zero */ 564 - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 565 - return -EINVAL; 566 - 567 - /* Verify that the area passed by the user is writeable */ 568 - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { 569 - error = -EFAULT; 570 - goto eexit_1; 571 - } 572 - 573 - /* Get the "struct file *" for the eventpoll file */ 574 - error = -EBADF; 575 - file = fget(epfd); 576 - if (!file) 577 - goto eexit_1; 578 - 579 - /* 580 - * We have to check that the file structure underneath the fd 581 - * the user passed to us _is_ an eventpoll file. 582 - */ 583 - error = -EINVAL; 584 - if (!is_file_epoll(file)) 585 - goto eexit_2; 586 - 587 - /* 588 - * At this point it is safe to assume that the "private_data" contains 589 - * our own data structure. 590 - */ 591 - ep = file->private_data; 592 - 593 - /* Time to fish for events ... */ 594 - error = ep_poll(ep, events, maxevents, timeout); 595 - 596 - eexit_2: 597 - fput(file); 598 - eexit_1: 599 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", 600 - current, epfd, events, maxevents, timeout, error)); 601 - 602 - return error; 603 - } 604 - 605 - 606 - #ifdef TIF_RESTORE_SIGMASK 607 - 608 - /* 609 - * Implement the event wait interface for the eventpoll file. It is the kernel 610 - * part of the user space epoll_pwait(2). 
611 - */ 612 - asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 613 - int maxevents, int timeout, const sigset_t __user *sigmask, 614 - size_t sigsetsize) 615 - { 616 - int error; 617 - sigset_t ksigmask, sigsaved; 618 - 619 - /* 620 - * If the caller wants a certain signal mask to be set during the wait, 621 - * we apply it here. 622 - */ 623 - if (sigmask) { 624 - if (sigsetsize != sizeof(sigset_t)) 625 - return -EINVAL; 626 - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) 627 - return -EFAULT; 628 - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 629 - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 630 - } 631 - 632 - error = sys_epoll_wait(epfd, events, maxevents, timeout); 633 - 634 - /* 635 - * If we changed the signal mask, we need to restore the original one. 636 - * In case we've got a signal while waiting, we do not restore the 637 - * signal mask yet, and we allow do_signal() to deliver the signal on 638 - * the way back to userspace, before the signal mask is restored. 
639 - */ 640 - if (sigmask) { 641 - if (error == -EINTR) { 642 - memcpy(&current->saved_sigmask, &sigsaved, 643 - sizeof(sigsaved)); 644 - set_thread_flag(TIF_RESTORE_SIGMASK); 645 - } else 646 - sigprocmask(SIG_SETMASK, &sigsaved, NULL); 647 - } 648 - 649 - return error; 650 - } 651 - 652 - #endif /* #ifdef TIF_RESTORE_SIGMASK */ 653 - 654 - 655 - static int ep_alloc(struct eventpoll **pep) 656 - { 657 - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); 658 - 659 - if (!ep) 660 - return -ENOMEM; 661 - 662 - rwlock_init(&ep->lock); 663 - init_rwsem(&ep->sem); 664 - init_waitqueue_head(&ep->wq); 665 - init_waitqueue_head(&ep->poll_wait); 666 - INIT_LIST_HEAD(&ep->rdllist); 667 - ep->rbr = RB_ROOT; 668 - 669 - *pep = ep; 670 - 671 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 672 - current, ep)); 673 - return 0; 674 - } 675 - 676 543 677 544 static void ep_free(struct eventpoll *ep) 678 545 { ··· 536 763 mutex_unlock(&epmutex); 537 764 } 538 765 766 + static int ep_eventpoll_release(struct inode *inode, struct file *file) 767 + { 768 + struct eventpoll *ep = file->private_data; 769 + 770 + if (ep) { 771 + ep_free(ep); 772 + kfree(ep); 773 + } 774 + 775 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); 776 + return 0; 777 + } 778 + 779 + static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 780 + { 781 + unsigned int pollflags = 0; 782 + unsigned long flags; 783 + struct eventpoll *ep = file->private_data; 784 + 785 + /* Insert inside our poll wait queue */ 786 + poll_wait(file, &ep->poll_wait, wait); 787 + 788 + /* Check our condition */ 789 + read_lock_irqsave(&ep->lock, flags); 790 + if (!list_empty(&ep->rdllist)) 791 + pollflags = POLLIN | POLLRDNORM; 792 + read_unlock_irqrestore(&ep->lock, flags); 793 + 794 + return pollflags; 795 + } 796 + 797 + /* File callbacks that implement the eventpoll file behaviour */ 798 + static const struct file_operations eventpoll_fops = { 799 + .release = 
ep_eventpoll_release, 800 + .poll = ep_eventpoll_poll 801 + }; 802 + 803 + /* Fast test to see if the file is an evenpoll file */ 804 + static inline int is_file_epoll(struct file *f) 805 + { 806 + return f->f_op == &eventpoll_fops; 807 + } 808 + 809 + /* 810 + * This is called from eventpoll_release() to unlink files from the eventpoll 811 + * interface. We need to have this facility to cleanup correctly files that are 812 + * closed without being removed from the eventpoll interface. 813 + */ 814 + void eventpoll_release_file(struct file *file) 815 + { 816 + struct list_head *lsthead = &file->f_ep_links; 817 + struct eventpoll *ep; 818 + struct epitem *epi; 819 + 820 + /* 821 + * We don't want to get "file->f_ep_lock" because it is not 822 + * necessary. It is not necessary because we're in the "struct file" 823 + * cleanup path, and this means that noone is using this file anymore. 824 + * The only hit might come from ep_free() but by holding the semaphore 825 + * will correctly serialize the operation. We do need to acquire 826 + * "ep->sem" after "epmutex" because ep_remove() requires it when called 827 + * from anywhere but ep_free(). 
828 + */ 829 + mutex_lock(&epmutex); 830 + 831 + while (!list_empty(lsthead)) { 832 + epi = list_first_entry(lsthead, struct epitem, fllink); 833 + 834 + ep = epi->ep; 835 + list_del_init(&epi->fllink); 836 + down_write(&ep->sem); 837 + ep_remove(ep, epi); 838 + up_write(&ep->sem); 839 + } 840 + 841 + mutex_unlock(&epmutex); 842 + } 843 + 844 + static int ep_alloc(struct eventpoll **pep) 845 + { 846 + struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); 847 + 848 + if (!ep) 849 + return -ENOMEM; 850 + 851 + rwlock_init(&ep->lock); 852 + init_rwsem(&ep->sem); 853 + init_waitqueue_head(&ep->wq); 854 + init_waitqueue_head(&ep->poll_wait); 855 + INIT_LIST_HEAD(&ep->rdllist); 856 + ep->rbr = RB_ROOT; 857 + 858 + *pep = ep; 859 + 860 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 861 + current, ep)); 862 + return 0; 863 + } 539 864 540 865 /* 541 866 * Search the file inside the eventpoll tree. It add usage count to ··· 671 800 return epir; 672 801 } 673 802 674 - 675 803 /* 676 - * Increment the usage count of the "struct epitem" making it sure 677 - * that the user will have a valid pointer to reference. 804 + * This is the callback that is passed to the wait queue wakeup 805 + * machanism. It is called by the stored file descriptors when they 806 + * have events to report. 678 807 */ 679 - static void ep_use_epitem(struct epitem *epi) 808 + static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 680 809 { 810 + int pwake = 0; 811 + unsigned long flags; 812 + struct epitem *epi = ep_item_from_wait(wait); 813 + struct eventpoll *ep = epi->ep; 681 814 682 - atomic_inc(&epi->usecnt); 815 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", 816 + current, epi->ffd.file, epi, ep)); 817 + 818 + write_lock_irqsave(&ep->lock, flags); 819 + 820 + /* 821 + * If the event mask does not contain any poll(2) event, we consider the 822 + * descriptor to be disabled. 
This condition is likely the effect of the 823 + * EPOLLONESHOT bit that disables the descriptor when an event is received, 824 + * until the next EPOLL_CTL_MOD will be issued. 825 + */ 826 + if (!(epi->event.events & ~EP_PRIVATE_BITS)) 827 + goto is_disabled; 828 + 829 + /* If this file is already in the ready list we exit soon */ 830 + if (ep_is_linked(&epi->rdllink)) 831 + goto is_linked; 832 + 833 + list_add_tail(&epi->rdllink, &ep->rdllist); 834 + 835 + is_linked: 836 + /* 837 + * Wake up ( if active ) both the eventpoll wait list and the ->poll() 838 + * wait list. 839 + */ 840 + if (waitqueue_active(&ep->wq)) 841 + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | 842 + TASK_INTERRUPTIBLE); 843 + if (waitqueue_active(&ep->poll_wait)) 844 + pwake++; 845 + 846 + is_disabled: 847 + write_unlock_irqrestore(&ep->lock, flags); 848 + 849 + /* We have to call this outside the lock */ 850 + if (pwake) 851 + ep_poll_safewake(&psw, &ep->poll_wait); 852 + 853 + return 1; 683 854 } 684 - 685 - 686 - /* 687 - * Decrement ( release ) the usage count by signaling that the user 688 - * has finished using the structure. It might lead to freeing the 689 - * structure itself if the count goes to zero. 
690 - */ 691 - static void ep_release_epitem(struct epitem *epi) 692 - { 693 - 694 - if (atomic_dec_and_test(&epi->usecnt)) 695 - kmem_cache_free(epi_cache, epi); 696 - } 697 - 698 855 699 856 /* 700 857 * This is the callback that is used to add our wait queue to the ··· 747 848 } 748 849 } 749 850 750 - 751 851 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) 752 852 { 753 853 int kcmp; ··· 766 868 rb_insert_color(&epi->rbn, &ep->rbr); 767 869 } 768 870 769 - 770 871 static int ep_insert(struct eventpoll *ep, struct epoll_event *event, 771 872 struct file *tfile, int fd) 772 873 { ··· 776 879 777 880 error = -ENOMEM; 778 881 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 779 - goto eexit_1; 882 + goto error_return; 780 883 781 884 /* Item initialization follow here ... */ 782 885 ep_rb_initnode(&epi->rbn); ··· 806 909 * high memory pressure. 807 910 */ 808 911 if (epi->nwait < 0) 809 - goto eexit_2; 912 + goto error_unregister; 810 913 811 914 /* Add the current item to the list of active epoll hook for this file */ 812 915 spin_lock(&tfile->f_ep_lock); ··· 841 944 842 945 return 0; 843 946 844 - eexit_2: 947 + error_unregister: 845 948 ep_unregister_pollwait(ep, epi); 846 949 847 950 /* ··· 854 957 write_unlock_irqrestore(&ep->lock, flags); 855 958 856 959 kmem_cache_free(epi_cache, epi); 857 - eexit_1: 960 + error_return: 858 961 return error; 859 962 } 860 - 861 963 862 964 /* 863 965 * Modify the interest event mask by dropping an event if the new mask ··· 919 1023 920 1024 return 0; 921 1025 } 922 - 923 - 924 - /* 925 - * This function unregister poll callbacks from the associated file descriptor. 926 - * Since this must be called without holding "ep->lock" the atomic exchange trick 927 - * will protect us from multiple unregister. 
928 - */ 929 - static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 930 - { 931 - int nwait; 932 - struct list_head *lsthead = &epi->pwqlist; 933 - struct eppoll_entry *pwq; 934 - 935 - /* This is called without locks, so we need the atomic exchange */ 936 - nwait = xchg(&epi->nwait, 0); 937 - 938 - if (nwait) { 939 - while (!list_empty(lsthead)) { 940 - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 941 - 942 - list_del_init(&pwq->llink); 943 - remove_wait_queue(pwq->whead, &pwq->wait); 944 - kmem_cache_free(pwq_cache, pwq); 945 - } 946 - } 947 - } 948 - 949 - 950 - /* 951 - * Unlink the "struct epitem" from all places it might have been hooked up. 952 - * This function must be called with write IRQ lock on "ep->lock". 953 - */ 954 - static int ep_unlink(struct eventpoll *ep, struct epitem *epi) 955 - { 956 - int error; 957 - 958 - /* 959 - * It can happen that this one is called for an item already unlinked. 960 - * The check protect us from doing a double unlink ( crash ). 961 - */ 962 - error = -ENOENT; 963 - if (!ep_rb_linked(&epi->rbn)) 964 - goto eexit_1; 965 - 966 - /* 967 - * Clear the event mask for the unlinked item. This will avoid item 968 - * notifications to be sent after the unlink operation from inside 969 - * the kernel->userspace event transfer loop. 970 - */ 971 - epi->event.events = 0; 972 - 973 - /* 974 - * At this point is safe to do the job, unlink the item from our rb-tree. 975 - * This operation togheter with the above check closes the door to 976 - * double unlinks. 977 - */ 978 - ep_rb_erase(&epi->rbn, &ep->rbr); 979 - 980 - /* 981 - * If the item we are going to remove is inside the ready file descriptors 982 - * we want to remove it from this list to avoid stale events. 
983 - */ 984 - if (ep_is_linked(&epi->rdllink)) 985 - list_del_init(&epi->rdllink); 986 - 987 - error = 0; 988 - eexit_1: 989 - 990 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", 991 - current, ep, epi->ffd.file, error)); 992 - 993 - return error; 994 - } 995 - 996 - 997 - /* 998 - * Removes a "struct epitem" from the eventpoll RB tree and deallocates 999 - * all the associated resources. 1000 - */ 1001 - static int ep_remove(struct eventpoll *ep, struct epitem *epi) 1002 - { 1003 - int error; 1004 - unsigned long flags; 1005 - struct file *file = epi->ffd.file; 1006 - 1007 - /* 1008 - * Removes poll wait queue hooks. We _have_ to do this without holding 1009 - * the "ep->lock" otherwise a deadlock might occur. This because of the 1010 - * sequence of the lock acquisition. Here we do "ep->lock" then the wait 1011 - * queue head lock when unregistering the wait queue. The wakeup callback 1012 - * will run by holding the wait queue head lock and will call our callback 1013 - * that will try to get "ep->lock". 
1014 - */ 1015 - ep_unregister_pollwait(ep, epi); 1016 - 1017 - /* Remove the current item from the list of epoll hooks */ 1018 - spin_lock(&file->f_ep_lock); 1019 - if (ep_is_linked(&epi->fllink)) 1020 - list_del_init(&epi->fllink); 1021 - spin_unlock(&file->f_ep_lock); 1022 - 1023 - /* We need to acquire the write IRQ lock before calling ep_unlink() */ 1024 - write_lock_irqsave(&ep->lock, flags); 1025 - 1026 - /* Really unlink the item from the RB tree */ 1027 - error = ep_unlink(ep, epi); 1028 - 1029 - write_unlock_irqrestore(&ep->lock, flags); 1030 - 1031 - if (error) 1032 - goto eexit_1; 1033 - 1034 - /* At this point it is safe to free the eventpoll item */ 1035 - ep_release_epitem(epi); 1036 - 1037 - error = 0; 1038 - eexit_1: 1039 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n", 1040 - current, ep, file, error)); 1041 - 1042 - return error; 1043 - } 1044 - 1045 - 1046 - /* 1047 - * This is the callback that is passed to the wait queue wakeup 1048 - * machanism. It is called by the stored file descriptors when they 1049 - * have events to report. 1050 - */ 1051 - static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 1052 - { 1053 - int pwake = 0; 1054 - unsigned long flags; 1055 - struct epitem *epi = ep_item_from_wait(wait); 1056 - struct eventpoll *ep = epi->ep; 1057 - 1058 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", 1059 - current, epi->ffd.file, epi, ep)); 1060 - 1061 - write_lock_irqsave(&ep->lock, flags); 1062 - 1063 - /* 1064 - * If the event mask does not contain any poll(2) event, we consider the 1065 - * descriptor to be disabled. This condition is likely the effect of the 1066 - * EPOLLONESHOT bit that disables the descriptor when an event is received, 1067 - * until the next EPOLL_CTL_MOD will be issued. 
1068 - */ 1069 - if (!(epi->event.events & ~EP_PRIVATE_BITS)) 1070 - goto is_disabled; 1071 - 1072 - /* If this file is already in the ready list we exit soon */ 1073 - if (ep_is_linked(&epi->rdllink)) 1074 - goto is_linked; 1075 - 1076 - list_add_tail(&epi->rdllink, &ep->rdllist); 1077 - 1078 - is_linked: 1079 - /* 1080 - * Wake up ( if active ) both the eventpoll wait list and the ->poll() 1081 - * wait list. 1082 - */ 1083 - if (waitqueue_active(&ep->wq)) 1084 - __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | 1085 - TASK_INTERRUPTIBLE); 1086 - if (waitqueue_active(&ep->poll_wait)) 1087 - pwake++; 1088 - 1089 - is_disabled: 1090 - write_unlock_irqrestore(&ep->lock, flags); 1091 - 1092 - /* We have to call this outside the lock */ 1093 - if (pwake) 1094 - ep_poll_safewake(&psw, &ep->poll_wait); 1095 - 1096 - return 1; 1097 - } 1098 - 1099 - 1100 - static int ep_eventpoll_close(struct inode *inode, struct file *file) 1101 - { 1102 - struct eventpoll *ep = file->private_data; 1103 - 1104 - if (ep) { 1105 - ep_free(ep); 1106 - kfree(ep); 1107 - } 1108 - 1109 - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); 1110 - return 0; 1111 - } 1112 - 1113 - 1114 - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 1115 - { 1116 - unsigned int pollflags = 0; 1117 - unsigned long flags; 1118 - struct eventpoll *ep = file->private_data; 1119 - 1120 - /* Insert inside our poll wait queue */ 1121 - poll_wait(file, &ep->poll_wait, wait); 1122 - 1123 - /* Check our condition */ 1124 - read_lock_irqsave(&ep->lock, flags); 1125 - if (!list_empty(&ep->rdllist)) 1126 - pollflags = POLLIN | POLLRDNORM; 1127 - read_unlock_irqrestore(&ep->lock, flags); 1128 - 1129 - return pollflags; 1130 - } 1131 - 1132 1026 1133 1027 /* 1134 1028 * This function is called without holding the "ep->lock" since the call to ··· 1031 1345 return eventcnt == 0 ? 
error: eventcnt; 1032 1346 } 1033 1347 1034 - 1035 1348 /* 1036 1349 * Perform the transfer of events to user space. 1037 1350 */ ··· 1065 1380 1066 1381 return eventcnt; 1067 1382 } 1068 - 1069 1383 1070 1384 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1071 1385 int maxevents, long timeout) ··· 1134 1450 1135 1451 return res; 1136 1452 } 1453 + 1454 + /* 1455 + * It opens an eventpoll file descriptor by suggesting a storage of "size" 1456 + * file descriptors. The size parameter is just an hint about how to size 1457 + * data structures. It won't prevent the user to store more than "size" 1458 + * file descriptors inside the epoll interface. It is the kernel part of 1459 + * the userspace epoll_create(2). 1460 + */ 1461 + asmlinkage long sys_epoll_create(int size) 1462 + { 1463 + int error, fd = -1; 1464 + struct eventpoll *ep; 1465 + struct inode *inode; 1466 + struct file *file; 1467 + 1468 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", 1469 + current, size)); 1470 + 1471 + /* 1472 + * Sanity check on the size parameter, and create the internal data 1473 + * structure ( "struct eventpoll" ). 1474 + */ 1475 + error = -EINVAL; 1476 + if (size <= 0 || (error = ep_alloc(&ep)) != 0) 1477 + goto error_return; 1478 + 1479 + /* 1480 + * Creates all the items needed to setup an eventpoll file. That is, 1481 + * a file structure, and inode and a free file descriptor. 
1482 + */ 1483 + error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]", 1484 + &eventpoll_fops, ep); 1485 + if (error) 1486 + goto error_free; 1487 + 1488 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1489 + current, size, fd)); 1490 + 1491 + return fd; 1492 + 1493 + error_free: 1494 + ep_free(ep); 1495 + kfree(ep); 1496 + error_return: 1497 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1498 + current, size, error)); 1499 + return error; 1500 + } 1501 + 1502 + /* 1503 + * The following function implements the controller interface for 1504 + * the eventpoll file that enables the insertion/removal/change of 1505 + * file descriptors inside the interest set. It represents 1506 + * the kernel part of the user space epoll_ctl(2). 1507 + */ 1508 + asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, 1509 + struct epoll_event __user *event) 1510 + { 1511 + int error; 1512 + struct file *file, *tfile; 1513 + struct eventpoll *ep; 1514 + struct epitem *epi; 1515 + struct epoll_event epds; 1516 + 1517 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", 1518 + current, epfd, op, fd, event)); 1519 + 1520 + error = -EFAULT; 1521 + if (ep_op_has_event(op) && 1522 + copy_from_user(&epds, event, sizeof(struct epoll_event))) 1523 + goto error_return; 1524 + 1525 + /* Get the "struct file *" for the eventpoll file */ 1526 + error = -EBADF; 1527 + file = fget(epfd); 1528 + if (!file) 1529 + goto error_return; 1530 + 1531 + /* Get the "struct file *" for the target file */ 1532 + tfile = fget(fd); 1533 + if (!tfile) 1534 + goto error_fput; 1535 + 1536 + /* The target file descriptor must support poll */ 1537 + error = -EPERM; 1538 + if (!tfile->f_op || !tfile->f_op->poll) 1539 + goto error_tgt_fput; 1540 + 1541 + /* 1542 + * We have to check that the file structure underneath the file descriptor 1543 + * the user passed to us _is_ an eventpoll file. 
And also we do not permit 1544 + * adding an epoll file descriptor inside itself. 1545 + */ 1546 + error = -EINVAL; 1547 + if (file == tfile || !is_file_epoll(file)) 1548 + goto error_tgt_fput; 1549 + 1550 + /* 1551 + * At this point it is safe to assume that the "private_data" contains 1552 + * our own data structure. 1553 + */ 1554 + ep = file->private_data; 1555 + 1556 + down_write(&ep->sem); 1557 + 1558 + /* Try to lookup the file inside our RB tree */ 1559 + epi = ep_find(ep, tfile, fd); 1560 + 1561 + error = -EINVAL; 1562 + switch (op) { 1563 + case EPOLL_CTL_ADD: 1564 + if (!epi) { 1565 + epds.events |= POLLERR | POLLHUP; 1566 + 1567 + error = ep_insert(ep, &epds, tfile, fd); 1568 + } else 1569 + error = -EEXIST; 1570 + break; 1571 + case EPOLL_CTL_DEL: 1572 + if (epi) 1573 + error = ep_remove(ep, epi); 1574 + else 1575 + error = -ENOENT; 1576 + break; 1577 + case EPOLL_CTL_MOD: 1578 + if (epi) { 1579 + epds.events |= POLLERR | POLLHUP; 1580 + error = ep_modify(ep, epi, &epds); 1581 + } else 1582 + error = -ENOENT; 1583 + break; 1584 + } 1585 + /* 1586 + * The function ep_find() increments the usage count of the structure 1587 + * so, if this is not NULL, we need to release it. 1588 + */ 1589 + if (epi) 1590 + ep_release_epitem(epi); 1591 + up_write(&ep->sem); 1592 + 1593 + error_tgt_fput: 1594 + fput(tfile); 1595 + error_fput: 1596 + fput(file); 1597 + error_return: 1598 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", 1599 + current, epfd, op, fd, event, error)); 1600 + 1601 + return error; 1602 + } 1603 + 1604 + /* 1605 + * Implement the event wait interface for the eventpoll file. It is the kernel 1606 + * part of the user space epoll_wait(2). 
1607 + */ 1608 + asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 1609 + int maxevents, int timeout) 1610 + { 1611 + int error; 1612 + struct file *file; 1613 + struct eventpoll *ep; 1614 + 1615 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", 1616 + current, epfd, events, maxevents, timeout)); 1617 + 1618 + /* The maximum number of event must be greater than zero */ 1619 + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1620 + return -EINVAL; 1621 + 1622 + /* Verify that the area passed by the user is writeable */ 1623 + if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { 1624 + error = -EFAULT; 1625 + goto error_return; 1626 + } 1627 + 1628 + /* Get the "struct file *" for the eventpoll file */ 1629 + error = -EBADF; 1630 + file = fget(epfd); 1631 + if (!file) 1632 + goto error_return; 1633 + 1634 + /* 1635 + * We have to check that the file structure underneath the fd 1636 + * the user passed to us _is_ an eventpoll file. 1637 + */ 1638 + error = -EINVAL; 1639 + if (!is_file_epoll(file)) 1640 + goto error_fput; 1641 + 1642 + /* 1643 + * At this point it is safe to assume that the "private_data" contains 1644 + * our own data structure. 1645 + */ 1646 + ep = file->private_data; 1647 + 1648 + /* Time to fish for events ... */ 1649 + error = ep_poll(ep, events, maxevents, timeout); 1650 + 1651 + error_fput: 1652 + fput(file); 1653 + error_return: 1654 + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", 1655 + current, epfd, events, maxevents, timeout, error)); 1656 + 1657 + return error; 1658 + } 1659 + 1660 + #ifdef TIF_RESTORE_SIGMASK 1661 + 1662 + /* 1663 + * Implement the event wait interface for the eventpoll file. It is the kernel 1664 + * part of the user space epoll_pwait(2). 
1665 + */ 1666 + asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 1667 + int maxevents, int timeout, const sigset_t __user *sigmask, 1668 + size_t sigsetsize) 1669 + { 1670 + int error; 1671 + sigset_t ksigmask, sigsaved; 1672 + 1673 + /* 1674 + * If the caller wants a certain signal mask to be set during the wait, 1675 + * we apply it here. 1676 + */ 1677 + if (sigmask) { 1678 + if (sigsetsize != sizeof(sigset_t)) 1679 + return -EINVAL; 1680 + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) 1681 + return -EFAULT; 1682 + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 1683 + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 1684 + } 1685 + 1686 + error = sys_epoll_wait(epfd, events, maxevents, timeout); 1687 + 1688 + /* 1689 + * If we changed the signal mask, we need to restore the original one. 1690 + * In case we've got a signal while waiting, we do not restore the 1691 + * signal mask yet, and we allow do_signal() to deliver the signal on 1692 + * the way back to userspace, before the signal mask is restored. 1693 + */ 1694 + if (sigmask) { 1695 + if (error == -EINTR) { 1696 + memcpy(&current->saved_sigmask, &sigsaved, 1697 + sizeof(sigsaved)); 1698 + set_thread_flag(TIF_RESTORE_SIGMASK); 1699 + } else 1700 + sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1701 + } 1702 + 1703 + return error; 1704 + } 1705 + 1706 + #endif /* #ifdef TIF_RESTORE_SIGMASK */ 1137 1707 1138 1708 static int __init eventpoll_init(void) 1139 1709 {