Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

aio: fix use-after-free due to missing POLLFREE handling

signalfd_poll() and binder_poll() are special in that they use a
waitqueue whose lifetime is tied to the current task, rather than to the
struct file as is normally the case. This is okay for blocking polls, since a
blocking poll occurs within one task; however, non-blocking polls
require another solution. This solution is for the queue to be cleared
before it is freed, by sending a POLLFREE notification to all waiters.

Unfortunately, only eventpoll handles POLLFREE. A second type of
non-blocking poll, aio poll, was added in kernel v4.18, and it doesn't
handle POLLFREE. This allows a use-after-free to occur if a signalfd or
binder fd is polled with aio poll, and the waitqueue gets freed.

Fix this by making aio poll handle POLLFREE.

A patch by Ramji Jiyani <ramjiyani@google.com>
(https://lore.kernel.org/r/20211027011834.2497484-1-ramjiyani@google.com)
tried to do this by making aio_poll_wake() always complete the request
inline if POLLFREE is seen. However, that solution had two bugs.
First, it introduced a deadlock, as it unconditionally locked the aio
context while holding the waitqueue lock, which inverts the normal
locking order. Second, it didn't consider that POLLFREE notifications
are missed while the request has been temporarily de-queued.

The second problem was solved by my previous patch. This patch then
properly fixes the use-after-free by handling POLLFREE in a
deadlock-free way. It does this by taking advantage of the fact that
freeing of the waitqueue is RCU-delayed, similar to what eventpoll does.

Fixes: 2c14fa838cbe ("aio: implement IOCB_CMD_POLL")
Cc: <stable@vger.kernel.org> # v4.18+
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>

+107 -32
+106 -31
fs/aio.c
··· 1620 1620 iocb_put(iocb); 1621 1621 } 1622 1622 1623 + /* 1624 + * Safely lock the waitqueue which the request is on, synchronizing with the 1625 + * case where the ->poll() provider decides to free its waitqueue early. 1626 + * 1627 + * Returns true on success, meaning that req->head->lock was locked, req->wait 1628 + * is on req->head, and an RCU read lock was taken. Returns false if the 1629 + * request was already removed from its waitqueue (which might no longer exist). 1630 + */ 1631 + static bool poll_iocb_lock_wq(struct poll_iocb *req) 1632 + { 1633 + wait_queue_head_t *head; 1634 + 1635 + /* 1636 + * While we hold the waitqueue lock and the waitqueue is nonempty, 1637 + * wake_up_pollfree() will wait for us. However, taking the waitqueue 1638 + * lock in the first place can race with the waitqueue being freed. 1639 + * 1640 + * We solve this as eventpoll does: by taking advantage of the fact that 1641 + * all users of wake_up_pollfree() will RCU-delay the actual free. If 1642 + * we enter rcu_read_lock() and see that the pointer to the queue is 1643 + * non-NULL, we can then lock it without the memory being freed out from 1644 + * under us, then check whether the request is still on the queue. 1645 + * 1646 + * Keep holding rcu_read_lock() as long as we hold the queue lock, in 1647 + * case the caller deletes the entry from the queue, leaving it empty. 1648 + * In that case, only RCU prevents the queue memory from being freed. 
1649 + */ 1650 + rcu_read_lock(); 1651 + head = smp_load_acquire(&req->head); 1652 + if (head) { 1653 + spin_lock(&head->lock); 1654 + if (!list_empty(&req->wait.entry)) 1655 + return true; 1656 + spin_unlock(&head->lock); 1657 + } 1658 + rcu_read_unlock(); 1659 + return false; 1660 + } 1661 + 1662 + static void poll_iocb_unlock_wq(struct poll_iocb *req) 1663 + { 1664 + spin_unlock(&req->head->lock); 1665 + rcu_read_unlock(); 1666 + } 1667 + 1623 1668 static void aio_poll_complete_work(struct work_struct *work) 1624 1669 { 1625 1670 struct poll_iocb *req = container_of(work, struct poll_iocb, work); ··· 1684 1639 * avoid further branches in the fast path. 1685 1640 */ 1686 1641 spin_lock_irq(&ctx->ctx_lock); 1687 - spin_lock(&req->head->lock); 1688 - if (!mask && !READ_ONCE(req->cancelled)) { 1689 - /* 1690 - * The request isn't actually ready to be completed yet. 1691 - * Reschedule completion if another wakeup came in. 1692 - */ 1693 - if (req->work_need_resched) { 1694 - schedule_work(&req->work); 1695 - req->work_need_resched = false; 1696 - } else { 1697 - req->work_scheduled = false; 1642 + if (poll_iocb_lock_wq(req)) { 1643 + if (!mask && !READ_ONCE(req->cancelled)) { 1644 + /* 1645 + * The request isn't actually ready to be completed yet. 1646 + * Reschedule completion if another wakeup came in. 
1647 + */ 1648 + if (req->work_need_resched) { 1649 + schedule_work(&req->work); 1650 + req->work_need_resched = false; 1651 + } else { 1652 + req->work_scheduled = false; 1653 + } 1654 + poll_iocb_unlock_wq(req); 1655 + spin_unlock_irq(&ctx->ctx_lock); 1656 + return; 1698 1657 } 1699 - spin_unlock(&req->head->lock); 1700 - spin_unlock_irq(&ctx->ctx_lock); 1701 - return; 1702 - } 1703 - list_del_init(&req->wait.entry); 1704 - spin_unlock(&req->head->lock); 1658 + list_del_init(&req->wait.entry); 1659 + poll_iocb_unlock_wq(req); 1660 + } /* else, POLLFREE has freed the waitqueue, so we must complete */ 1705 1661 list_del_init(&iocb->ki_list); 1706 1662 iocb->ki_res.res = mangle_poll(mask); 1707 1663 spin_unlock_irq(&ctx->ctx_lock); ··· 1716 1670 struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); 1717 1671 struct poll_iocb *req = &aiocb->poll; 1718 1672 1719 - spin_lock(&req->head->lock); 1720 - WRITE_ONCE(req->cancelled, true); 1721 - if (!req->work_scheduled) { 1722 - schedule_work(&aiocb->poll.work); 1723 - req->work_scheduled = true; 1724 - } 1725 - spin_unlock(&req->head->lock); 1673 + if (poll_iocb_lock_wq(req)) { 1674 + WRITE_ONCE(req->cancelled, true); 1675 + if (!req->work_scheduled) { 1676 + schedule_work(&aiocb->poll.work); 1677 + req->work_scheduled = true; 1678 + } 1679 + poll_iocb_unlock_wq(req); 1680 + } /* else, the request was force-cancelled by POLLFREE already */ 1726 1681 1727 1682 return 0; 1728 1683 } ··· 1775 1728 * 1776 1729 * Don't remove the request from the waitqueue here, as it might 1777 1730 * not actually be complete yet (we won't know until vfs_poll() 1778 - * is called), and we must not miss any wakeups. 1731 + * is called), and we must not miss any wakeups. POLLFREE is an 1732 + * exception to this; see below. 
1779 1733 */ 1780 1734 if (req->work_scheduled) { 1781 1735 req->work_need_resched = true; 1782 1736 } else { 1783 1737 schedule_work(&req->work); 1784 1738 req->work_scheduled = true; 1739 + } 1740 + 1741 + /* 1742 + * If the waitqueue is being freed early but we can't complete 1743 + * the request inline, we have to tear down the request as best 1744 + * we can. That means immediately removing the request from its 1745 + * waitqueue and preventing all further accesses to the 1746 + * waitqueue via the request. We also need to schedule the 1747 + * completion work (done above). Also mark the request as 1748 + * cancelled, to potentially skip an unneeded call to ->poll(). 1749 + */ 1750 + if (mask & POLLFREE) { 1751 + WRITE_ONCE(req->cancelled, true); 1752 + list_del_init(&req->wait.entry); 1753 + 1754 + /* 1755 + * Careful: this *must* be the last step, since as soon 1756 + * as req->head is NULL'ed out, the request can be 1757 + * completed and freed, since aio_poll_complete_work() 1758 + * will no longer need to take the waitqueue lock. 
1759 + */ 1760 + smp_store_release(&req->head, NULL); 1785 1761 } 1786 1762 } 1787 1763 return 1; ··· 1813 1743 struct aio_poll_table { 1814 1744 struct poll_table_struct pt; 1815 1745 struct aio_kiocb *iocb; 1746 + bool queued; 1816 1747 int error; 1817 1748 }; 1818 1749 ··· 1824 1753 struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); 1825 1754 1826 1755 /* multiple wait queues per file are not supported */ 1827 - if (unlikely(pt->iocb->poll.head)) { 1756 + if (unlikely(pt->queued)) { 1828 1757 pt->error = -EINVAL; 1829 1758 return; 1830 1759 } 1831 1760 1761 + pt->queued = true; 1832 1762 pt->error = 0; 1833 1763 pt->iocb->poll.head = head; 1834 1764 add_wait_queue(head, &pt->iocb->poll.wait); ··· 1861 1789 apt.pt._qproc = aio_poll_queue_proc; 1862 1790 apt.pt._key = req->events; 1863 1791 apt.iocb = aiocb; 1792 + apt.queued = false; 1864 1793 apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ 1865 1794 1866 1795 /* initialized the list so that we can do list_empty checks */ ··· 1870 1797 1871 1798 mask = vfs_poll(req->file, &apt.pt) & req->events; 1872 1799 spin_lock_irq(&ctx->ctx_lock); 1873 - if (likely(req->head)) { 1874 - spin_lock(&req->head->lock); 1875 - if (list_empty(&req->wait.entry) || req->work_scheduled) { 1800 + if (likely(apt.queued)) { 1801 + bool on_queue = poll_iocb_lock_wq(req); 1802 + 1803 + if (!on_queue || req->work_scheduled) { 1876 1804 /* 1877 1805 * aio_poll_wake() already either scheduled the async 1878 1806 * completion work, or completed the request inline. ··· 1889 1815 } else if (cancel) { 1890 1816 /* Cancel if possible (may be too late though). */ 1891 1817 WRITE_ONCE(req->cancelled, true); 1892 - } else if (!list_empty(&req->wait.entry)) { 1818 + } else if (on_queue) { 1893 1819 /* 1894 1820 * Actually waiting for an event, so add the request to 1895 1821 * active_reqs so that it can be cancelled if needed. 
··· 1897 1823 list_add_tail(&aiocb->ki_list, &ctx->active_reqs); 1898 1824 aiocb->ki_cancel = aio_poll_cancel; 1899 1825 } 1900 - spin_unlock(&req->head->lock); 1826 + if (on_queue) 1827 + poll_iocb_unlock_wq(req); 1901 1828 } 1902 1829 if (mask) { /* no async, we'd stolen it */ 1903 1830 aiocb->ki_res.res = mangle_poll(mask);
+1 -1
include/uapi/asm-generic/poll.h
··· 29 29 #define POLLRDHUP 0x2000 30 30 #endif 31 31 32 - #define POLLFREE (__force __poll_t)0x4000 /* currently only for epoll */ 32 + #define POLLFREE (__force __poll_t)0x4000 33 33 34 34 #define POLL_BUSY_LOOP (__force __poll_t)0x8000 35 35