Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

unix: avoid use-after-free in ep_remove_wait_queue

Rainer Weikusat <rweikusat@mobileactivedefense.com> writes:
An AF_UNIX datagram socket being the client in an n:1 association with
some server socket is only allowed to send messages to the server if the
receive queue of this socket contains at most sk_max_ack_backlog
datagrams. This implies that prospective writers might be forced to go
to sleep even though none of the messages presently enqueued on the server
receive queue were sent by them. In order to ensure that these writers will
be woken up once space becomes available again, the present unix_dgram_poll
routine does a second sock_poll_wait call with the peer_wait wait queue
of the server socket as queue argument (unix_dgram_recvmsg does a wake
up on this queue after a datagram was received). This is inherently
problematic because the server socket is only guaranteed to remain alive
for as long as the client still holds a reference to it. In case the
connection is dissolved via connect or by the dead peer detection logic
in unix_dgram_sendmsg, the server socket may be freed even though "the
polling mechanism" (in particular, epoll) still holds a pointer to the
corresponding peer_wait queue. There is no way to forcibly deregister a
wait queue with epoll.

Based on an idea by Jason Baron, the patch below changes the code such
that a wait_queue_t belonging to the client socket is enqueued on the
peer_wait queue of the server whenever the peer receive queue full
condition is detected by either a sendmsg or a poll. A wake up on the
peer queue is then relayed to the ordinary wait queue of the client
socket via its wake function. The connection to the peer wait queue is again
dissolved if either a wake up is about to be relayed or the client
socket reconnects or a dead peer is detected or the client socket is
itself closed. This enables removing the second sock_poll_wait from
unix_dgram_poll, thus avoiding the use-after-free, while still ensuring
that no blocked writer sleeps forever.

Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com>
Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets")
Reviewed-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Rainer Weikusat and committed by
David S. Miller
7d267278 3b13758f

+165 -19
+1
include/net/af_unix.h
··· 62 62 #define UNIX_GC_CANDIDATE 0 63 63 #define UNIX_GC_MAYBE_CYCLE 1 64 64 struct socket_wq peer_wq; 65 + wait_queue_t peer_wake; 65 66 }; 66 67 67 68 static inline struct unix_sock *unix_sk(const struct sock *sk)
+164 -19
net/unix/af_unix.c
··· 326 326 return s; 327 327 } 328 328 329 + /* Support code for asymmetrically connected dgram sockets 330 + * 331 + * If a datagram socket is connected to a socket not itself connected 332 + * to the first socket (eg, /dev/log), clients may only enqueue more 333 + * messages if the present receive queue of the server socket is not 334 + * "too large". This means there's a second writeability condition 335 + * poll and sendmsg need to test. The dgram recv code will do a wake 336 + * up on the peer_wait wait queue of a socket upon reception of a 337 + * datagram which needs to be propagated to sleeping would-be writers 338 + * since these might not have sent anything so far. This can't be 339 + * accomplished via poll_wait because the lifetime of the server 340 + * socket might be less than that of its clients if these break their 341 + * association with it or if the server socket is closed while clients 342 + * are still connected to it and there's no way to inform "a polling 343 + * implementation" that it should let go of a certain wait queue 344 + * 345 + * In order to propagate a wake up, a wait_queue_t of the client 346 + * socket is enqueued on the peer_wait queue of the server socket 347 + * whose wake function does a wake_up on the ordinary client socket 348 + * wait queue. This connection is established whenever a write (or 349 + * poll for write) hit the flow control condition and broken when the 350 + * association to the server socket is dissolved or after a wake up 351 + * was relayed. 
352 + */ 353 + 354 + static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, 355 + void *key) 356 + { 357 + struct unix_sock *u; 358 + wait_queue_head_t *u_sleep; 359 + 360 + u = container_of(q, struct unix_sock, peer_wake); 361 + 362 + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 363 + q); 364 + u->peer_wake.private = NULL; 365 + 366 + /* relaying can only happen while the wq still exists */ 367 + u_sleep = sk_sleep(&u->sk); 368 + if (u_sleep) 369 + wake_up_interruptible_poll(u_sleep, key); 370 + 371 + return 0; 372 + } 373 + 374 + static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 375 + { 376 + struct unix_sock *u, *u_other; 377 + int rc; 378 + 379 + u = unix_sk(sk); 380 + u_other = unix_sk(other); 381 + rc = 0; 382 + spin_lock(&u_other->peer_wait.lock); 383 + 384 + if (!u->peer_wake.private) { 385 + u->peer_wake.private = other; 386 + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 387 + 388 + rc = 1; 389 + } 390 + 391 + spin_unlock(&u_other->peer_wait.lock); 392 + return rc; 393 + } 394 + 395 + static void unix_dgram_peer_wake_disconnect(struct sock *sk, 396 + struct sock *other) 397 + { 398 + struct unix_sock *u, *u_other; 399 + 400 + u = unix_sk(sk); 401 + u_other = unix_sk(other); 402 + spin_lock(&u_other->peer_wait.lock); 403 + 404 + if (u->peer_wake.private == other) { 405 + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 406 + u->peer_wake.private = NULL; 407 + } 408 + 409 + spin_unlock(&u_other->peer_wait.lock); 410 + } 411 + 412 + static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 413 + struct sock *other) 414 + { 415 + unix_dgram_peer_wake_disconnect(sk, other); 416 + wake_up_interruptible_poll(sk_sleep(sk), 417 + POLLOUT | 418 + POLLWRNORM | 419 + POLLWRBAND); 420 + } 421 + 422 + /* preconditions: 423 + * - unix_peer(sk) == other 424 + * - association is stable 425 + */ 426 + static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 427 + 
{ 428 + int connected; 429 + 430 + connected = unix_dgram_peer_wake_connect(sk, other); 431 + 432 + if (unix_recvq_full(other)) 433 + return 1; 434 + 435 + if (connected) 436 + unix_dgram_peer_wake_disconnect(sk, other); 437 + 438 + return 0; 439 + } 440 + 329 441 static int unix_writable(const struct sock *sk) 330 442 { 331 443 return sk->sk_state != TCP_LISTEN && ··· 543 431 skpair->sk_state_change(skpair); 544 432 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 545 433 } 434 + 435 + unix_dgram_peer_wake_disconnect(sk, skpair); 546 436 sock_put(skpair); /* It may now die */ 547 437 unix_peer(sk) = NULL; 548 438 } ··· 780 666 INIT_LIST_HEAD(&u->link); 781 667 mutex_init(&u->readlock); /* single task reading lock */ 782 668 init_waitqueue_head(&u->peer_wait); 669 + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 783 670 unix_insert_socket(unix_sockets_unbound(sk), sk); 784 671 out: 785 672 if (sk == NULL) ··· 1148 1033 if (unix_peer(sk)) { 1149 1034 struct sock *old_peer = unix_peer(sk); 1150 1035 unix_peer(sk) = other; 1036 + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1037 + 1151 1038 unix_state_double_unlock(sk, other); 1152 1039 1153 1040 if (other != old_peer) ··· 1589 1472 struct scm_cookie scm; 1590 1473 int max_level; 1591 1474 int data_len = 0; 1475 + int sk_locked; 1592 1476 1593 1477 wait_for_unix_gc(); 1594 1478 err = scm_send(sock, msg, &scm, false); ··· 1668 1550 goto out_free; 1669 1551 } 1670 1552 1553 + sk_locked = 0; 1671 1554 unix_state_lock(other); 1555 + restart_locked: 1672 1556 err = -EPERM; 1673 1557 if (!unix_may_send(sk, other)) 1674 1558 goto out_unlock; 1675 1559 1676 - if (sock_flag(other, SOCK_DEAD)) { 1560 + if (unlikely(sock_flag(other, SOCK_DEAD))) { 1677 1561 /* 1678 1562 * Check with 1003.1g - what should 1679 1563 * datagram error ··· 1683 1563 unix_state_unlock(other); 1684 1564 sock_put(other); 1685 1565 1566 + if (!sk_locked) 1567 + unix_state_lock(sk); 1568 + 1686 1569 err = 0; 1687 - 
unix_state_lock(sk); 1688 1570 if (unix_peer(sk) == other) { 1689 1571 unix_peer(sk) = NULL; 1572 + unix_dgram_peer_wake_disconnect_wakeup(sk, other); 1573 + 1690 1574 unix_state_unlock(sk); 1691 1575 1692 1576 unix_dgram_disconnected(sk, other); ··· 1716 1592 goto out_unlock; 1717 1593 } 1718 1594 1719 - if (unix_peer(other) != sk && unix_recvq_full(other)) { 1720 - if (!timeo) { 1595 + if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { 1596 + if (timeo) { 1597 + timeo = unix_wait_for_peer(other, timeo); 1598 + 1599 + err = sock_intr_errno(timeo); 1600 + if (signal_pending(current)) 1601 + goto out_free; 1602 + 1603 + goto restart; 1604 + } 1605 + 1606 + if (!sk_locked) { 1607 + unix_state_unlock(other); 1608 + unix_state_double_lock(sk, other); 1609 + } 1610 + 1611 + if (unix_peer(sk) != other || 1612 + unix_dgram_peer_wake_me(sk, other)) { 1721 1613 err = -EAGAIN; 1614 + sk_locked = 1; 1722 1615 goto out_unlock; 1723 1616 } 1724 1617 1725 - timeo = unix_wait_for_peer(other, timeo); 1726 - 1727 - err = sock_intr_errno(timeo); 1728 - if (signal_pending(current)) 1729 - goto out_free; 1730 - 1731 - goto restart; 1618 + if (!sk_locked) { 1619 + sk_locked = 1; 1620 + goto restart_locked; 1621 + } 1732 1622 } 1623 + 1624 + if (unlikely(sk_locked)) 1625 + unix_state_unlock(sk); 1733 1626 1734 1627 if (sock_flag(other, SOCK_RCVTSTAMP)) 1735 1628 __net_timestamp(skb); ··· 1761 1620 return len; 1762 1621 1763 1622 out_unlock: 1623 + if (sk_locked) 1624 + unix_state_unlock(sk); 1764 1625 unix_state_unlock(other); 1765 1626 out_free: 1766 1627 kfree_skb(skb); ··· 2619 2476 return mask; 2620 2477 2621 2478 writable = unix_writable(sk); 2622 - other = unix_peer_get(sk); 2623 - if (other) { 2624 - if (unix_peer(other) != sk) { 2625 - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); 2626 - if (unix_recvq_full(other)) 2627 - writable = 0; 2628 - } 2629 - sock_put(other); 2479 + if (writable) { 2480 + unix_state_lock(sk); 2481 + 2482 + other = unix_peer(sk); 
2483 + if (other && unix_peer(other) != sk && 2484 + unix_recvq_full(other) && 2485 + unix_dgram_peer_wake_me(sk, other)) 2486 + writable = 0; 2487 + 2488 + unix_state_unlock(sk); 2630 2489 } 2631 2490 2632 2491 if (writable)