Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tproxy: fix hash locking issue when using port redirection in __inet_inherit_port()

When __inet_inherit_port() is called on a tproxy connection the wrong locks are
held for the inet_bind_bucket the child socket is added to.
__inet_inherit_port() made an implicit assumption that the listener's port
number (and thus its bind bucket) is the same as that of the child socket.
Unfortunately, if you're using the TPROXY target to redirect skbs to a
transparent proxy that assumption is not true anymore and things break.

This patch adds code to __inet_inherit_port() so that it can handle this case
by looking up or creating a new bind bucket for the child socket and updates
callers of __inet_inherit_port() to gracefully handle __inet_inherit_port()
failing.

Reported by and original patch from Stephen Buck <stephen.buck@exinda.com>.
See http://marc.info/?t=128169268200001&r=1&w=2 for the original discussion.

Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>

authored by

Balazs Scheidler and committed by
Patrick McHardy
093d2823 6006db84

+56 -16
+1 -1
include/net/inet_hashtables.h
··· 245 245 } 246 246 247 247 /* Caller must disable local BH processing. */ 248 - extern void __inet_inherit_port(struct sock *sk, struct sock *child); 248 + extern int __inet_inherit_port(struct sock *sk, struct sock *child); 249 249 250 250 extern void inet_put_port(struct sock *sk); 251 251
+7 -3
net/dccp/ipv4.c
··· 392 392 393 393 newsk = dccp_create_openreq_child(sk, req, skb); 394 394 if (newsk == NULL) 395 - goto exit; 395 + goto exit_nonewsk; 396 396 397 397 sk_setup_caps(newsk, dst); 398 398 ··· 409 409 410 410 dccp_sync_mss(newsk, dst_mtu(dst)); 411 411 412 + if (__inet_inherit_port(sk, newsk) < 0) { 413 + sock_put(newsk); 414 + goto exit; 415 + } 412 416 __inet_hash_nolisten(newsk, NULL); 413 - __inet_inherit_port(sk, newsk); 414 417 415 418 return newsk; 416 419 417 420 exit_overflow: 418 421 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 422 + exit_nonewsk: 423 + dst_release(dst); 419 424 exit: 420 425 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 421 - dst_release(dst); 422 426 return NULL; 423 427 } 424 428
+7 -3
net/dccp/ipv6.c
··· 564 564 565 565 newsk = dccp_create_openreq_child(sk, req, skb); 566 566 if (newsk == NULL) 567 - goto out; 567 + goto out_nonewsk; 568 568 569 569 /* 570 570 * No need to charge this sock to the relevant IPv6 refcnt debug socks ··· 632 632 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; 633 633 newinet->inet_rcv_saddr = LOOPBACK4_IPV6; 634 634 635 + if (__inet_inherit_port(sk, newsk) < 0) { 636 + sock_put(newsk); 637 + goto out; 638 + } 635 639 __inet6_hash(newsk, NULL); 636 - __inet_inherit_port(sk, newsk); 637 640 638 641 return newsk; 639 642 640 643 out_overflow: 641 644 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 645 + out_nonewsk: 646 + dst_release(dst); 642 647 out: 643 648 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 644 649 if (opt != NULL && opt != np->opt) 645 650 sock_kfree_s(sk, opt, opt->tot_len); 646 - dst_release(dst); 647 651 return NULL; 648 652 } 649 653
+26 -2
net/ipv4/inet_hashtables.c
··· 101 101 } 102 102 EXPORT_SYMBOL(inet_put_port); 103 103 104 - void __inet_inherit_port(struct sock *sk, struct sock *child) 104 + int __inet_inherit_port(struct sock *sk, struct sock *child) 105 105 { 106 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 107 - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, 107 + unsigned short port = inet_sk(child)->inet_num; 108 + const int bhash = inet_bhashfn(sock_net(sk), port, 108 109 table->bhash_size); 109 110 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 111 struct inet_bind_bucket *tb; 111 112 112 113 spin_lock(&head->lock); 113 114 tb = inet_csk(sk)->icsk_bind_hash; 115 + if (tb->port != port) { 116 + /* NOTE: using tproxy and redirecting skbs to a proxy 117 + * on a different listener port breaks the assumption 118 + * that the listener socket's icsk_bind_hash is the same 119 + * as that of the child socket. We have to look up or 120 + * create a new bind bucket for the child here. */ 121 + struct hlist_node *node; 122 + inet_bind_bucket_for_each(tb, node, &head->chain) { 123 + if (net_eq(ib_net(tb), sock_net(sk)) && 124 + tb->port == port) 125 + break; 126 + } 127 + if (!node) { 128 + tb = inet_bind_bucket_create(table->bind_bucket_cachep, 129 + sock_net(sk), head, port); 130 + if (!tb) { 131 + spin_unlock(&head->lock); 132 + return -ENOMEM; 133 + } 134 + } 135 + } 114 136 sk_add_bind_node(child, &tb->owners); 115 137 inet_csk(child)->icsk_bind_hash = tb; 116 138 spin_unlock(&head->lock); 139 + 140 + return 0; 117 141 } 118 142 EXPORT_SYMBOL_GPL(__inet_inherit_port); 119 143
+7 -3
net/ipv4/tcp_ipv4.c
··· 1422 1422 1423 1423 newsk = tcp_create_openreq_child(sk, req, skb); 1424 1424 if (!newsk) 1425 - goto exit; 1425 + goto exit_nonewsk; 1426 1426 1427 1427 newsk->sk_gso_type = SKB_GSO_TCPV4; 1428 1428 sk_setup_caps(newsk, dst); ··· 1469 1469 } 1470 1470 #endif 1471 1471 1472 + if (__inet_inherit_port(sk, newsk) < 0) { 1473 + sock_put(newsk); 1474 + goto exit; 1475 + } 1472 1476 __inet_hash_nolisten(newsk, NULL); 1473 - __inet_inherit_port(sk, newsk); 1474 1477 1475 1478 return newsk; 1476 1479 1477 1480 exit_overflow: 1478 1481 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1482 + exit_nonewsk: 1483 + dst_release(dst); 1479 1484 exit: 1480 1485 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1481 - dst_release(dst); 1482 1486 return NULL; 1483 1487 } 1484 1488 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+8 -4
net/ipv6/tcp_ipv6.c
··· 1409 1409 1410 1410 newsk = tcp_create_openreq_child(sk, req, skb); 1411 1411 if (newsk == NULL) 1412 - goto out; 1412 + goto out_nonewsk; 1413 1413 1414 1414 /* 1415 1415 * No need to charge this sock to the relevant IPv6 refcnt debug socks ··· 1497 1497 } 1498 1498 #endif 1499 1499 1500 + if (__inet_inherit_port(sk, newsk) < 0) { 1501 + sock_put(newsk); 1502 + goto out; 1503 + } 1500 1504 __inet6_hash(newsk, NULL); 1501 - __inet_inherit_port(sk, newsk); 1502 1505 1503 1506 return newsk; 1504 1507 1505 1508 out_overflow: 1506 1509 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1507 - out: 1508 - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1510 + out_nonewsk: 1509 1511 if (opt && opt != np->opt) 1510 1512 sock_kfree_s(sk, opt, opt->tot_len); 1511 1513 dst_release(dst); 1514 + out: 1515 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1512 1516 return NULL; 1513 1517 } 1514 1518