Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: inet: Retire port only listening_hash

The listen sk is currently stored in two hash tables,
listening_hash (hashed by port) and lhash2 (hashed by port and address).

After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"),
the TCP-SYN lookup fast path does not use listening_hash.

The commit 05c0b35709c5 ("tcp: seq_file: Replace listening_hash with lhash2")
also moved the seq_file (/proc/net/tcp) iteration usage from
listening_hash to lhash2.

There are still a few listening_hash usages left.
One of them is inet_reuseport_add_sock() which uses the listening_hash
to search a listen sk during the listen() system call. This turns
out to be very slow on use cases that listen on many different
VIPs at a popular port (e.g. 443). [ On top of the slowness in
adding to the tail in the IPv6 case ]. A later patch in this series has a
selftest to demonstrate this case.

This patch takes this chance to move all remaining listening_hash
usages to lhash2 and then retire listening_hash.

Since most changes need to be done together, it is hard to cut
the listening_hash to lhash2 switch into small patches. The
changes in this patch are highlighted here for review
purposes.

1. Because of the listening_hash removal, lhash2 can use the
sk->sk_nulls_node instead of the icsk->icsk_listen_portaddr_node.
This also keeps the sk_unhashed() check working as is
once sk is no longer added to listening_hash.

The union is removed from inet_listen_hashbucket because
only nulls_head is needed.

2. icsk->icsk_listen_portaddr_node and its helpers are removed.

3. The current lhash2 users need to iterate with sk_nulls_node
instead of icsk_listen_portaddr_node.

One case is in the inet[6]_lhash2_lookup().

Another case is the seq_file iterator in tcp_ipv4.c.
One thing to note is that sk_nulls_next() is needed
because the old inet_lhash2_for_each_icsk_continue()
does a "next" first before iterating.

4. Move the remaining listening_hash usage to lhash2

inet_reuseport_add_sock() which this series is
trying to improve.

inet_diag.c and mptcp_diag.c are the final two
remaining use cases and are moved to lhash2 now as well.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Martin KaFai Lau and committed by
Jakub Kicinski
cae3873c e8d00590

+26 -101
-2
include/net/inet_connection_sock.h
··· 66 66 * @icsk_ulp_ops Pluggable ULP control hook 67 67 * @icsk_ulp_data ULP private data 68 68 * @icsk_clean_acked Clean acked data hook 69 - * @icsk_listen_portaddr_node hash to the portaddr listener hashtable 70 69 * @icsk_ca_state: Congestion control state 71 70 * @icsk_retransmits: Number of unrecovered [RTO] timeouts 72 71 * @icsk_pending: Scheduled timer event ··· 95 96 const struct tcp_ulp_ops *icsk_ulp_ops; 96 97 void __rcu *icsk_ulp_data; 97 98 void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); 98 - struct hlist_node icsk_listen_portaddr_node; 99 99 unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); 100 100 __u8 icsk_ca_state:5, 101 101 icsk_ca_initialized:1,
+1 -40
include/net/inet_hashtables.h
··· 111 111 #define LISTENING_NULLS_BASE (1U << 29) 112 112 struct inet_listen_hashbucket { 113 113 spinlock_t lock; 114 - union { 115 - struct hlist_head head; 116 - struct hlist_nulls_head nulls_head; 117 - }; 114 + struct hlist_nulls_head nulls_head; 118 115 }; 119 116 120 117 /* This is for listening sockets, thus all sockets which possess wildcards. */ ··· 139 142 /* The 2nd listener table hashed by local port and address */ 140 143 unsigned int lhash2_mask; 141 144 struct inet_listen_hashbucket *lhash2; 142 - 143 - /* All the above members are written once at bootup and 144 - * never written again _or_ are predominantly read-access. 145 - * 146 - * Now align to a new cache line as all the following members 147 - * might be often dirty. 148 - */ 149 - /* All sockets in TCP_LISTEN state will be in listening_hash. 150 - * This is the only table where wildcard'd TCP sockets can 151 - * exist. listening_hash is only hashed by local port number. 152 - * If lhash2 is initialized, the same socket will also be hashed 153 - * to lhash2 by port and address. 154 - */ 155 - struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] 156 - ____cacheline_aligned_in_smp; 157 145 }; 158 - 159 - #define inet_lhash2_for_each_icsk_continue(__icsk) \ 160 - hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) 161 - 162 - #define inet_lhash2_for_each_icsk(__icsk, list) \ 163 - hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) 164 - 165 - #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ 166 - hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) 167 146 168 147 static inline struct inet_listen_hashbucket * 169 148 inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) ··· 202 229 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 203 230 const unsigned short snum); 204 231 205 - /* These can have wildcards, don't try too hard. 
*/ 206 - static inline u32 inet_lhashfn(const struct net *net, const unsigned short num) 207 - { 208 - return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1); 209 - } 210 - 211 - static inline int inet_sk_listen_hashfn(const struct sock *sk) 212 - { 213 - return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num); 214 - } 215 - 216 232 /* Caller must disable local BH processing. */ 217 233 int __inet_inherit_port(const struct sock *sk, struct sock *child); 218 234 219 235 void inet_put_port(struct sock *sk); 220 236 221 - void inet_hashinfo_init(struct inet_hashinfo *h); 222 237 void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 223 238 unsigned long numentries, int scale, 224 239 unsigned long low_limit,
-1
net/dccp/proto.c
··· 1110 1110 1111 1111 BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > 1112 1112 sizeof_field(struct sk_buff, cb)); 1113 - inet_hashinfo_init(&dccp_hashinfo); 1114 1113 rc = inet_hashinfo2_init_mod(&dccp_hashinfo); 1115 1114 if (rc) 1116 1115 goto out_fail;
+3 -2
net/ipv4/inet_diag.c
··· 1028 1028 if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) 1029 1029 goto skip_listen_ht; 1030 1030 1031 - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 1031 + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { 1032 1032 struct inet_listen_hashbucket *ilb; 1033 1033 struct hlist_nulls_node *node; 1034 1034 1035 1035 num = 0; 1036 - ilb = &hashinfo->listening_hash[i]; 1036 + ilb = &hashinfo->lhash2[i]; 1037 + 1037 1038 spin_lock(&ilb->lock); 1038 1039 sk_nulls_for_each(sk, node, &ilb->nulls_head) { 1039 1040 struct inet_sock *inet = inet_sk(sk);
+9 -38
net/ipv4/inet_hashtables.c
··· 246 246 const __be32 daddr, const unsigned short hnum, 247 247 const int dif, const int sdif) 248 248 { 249 - struct inet_connection_sock *icsk; 250 249 struct sock *sk, *result = NULL; 250 + struct hlist_nulls_node *node; 251 251 int score, hiscore = 0; 252 252 253 - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { 254 - sk = (struct sock *)icsk; 253 + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 255 254 score = compute_score(sk, net, hnum, daddr, dif, sdif); 256 255 if (score > hiscore) { 257 256 result = lookup_reuseport(net, sk, skb, doff, ··· 597 598 { 598 599 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 599 600 struct inet_listen_hashbucket *ilb2; 600 - struct inet_listen_hashbucket *ilb; 601 601 int err = 0; 602 602 603 603 if (sk->sk_state != TCP_LISTEN) { ··· 606 608 return 0; 607 609 } 608 610 WARN_ON(!sk_unhashed(sk)); 609 - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 610 611 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 611 612 612 - spin_lock(&ilb->lock); 613 613 spin_lock(&ilb2->lock); 614 614 if (sk->sk_reuseport) { 615 - err = inet_reuseport_add_sock(sk, ilb); 615 + err = inet_reuseport_add_sock(sk, ilb2); 616 616 if (err) 617 617 goto unlock; 618 618 } 619 619 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 620 - sk->sk_family == AF_INET6) { 621 - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 622 - &ilb2->head); 623 - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); 624 - } else { 625 - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 626 - &ilb2->head); 627 - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); 628 - } 620 + sk->sk_family == AF_INET6) 621 + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 622 + else 623 + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 629 624 sock_set_flag(sk, SOCK_RCU_FREE); 630 625 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 631 626 unlock: 632 627 spin_unlock(&ilb2->lock); 633 - spin_unlock(&ilb->lock); 634 628 635 629 return err; 
636 630 } ··· 648 658 649 659 if (sk->sk_state == TCP_LISTEN) { 650 660 struct inet_listen_hashbucket *ilb2; 651 - struct inet_listen_hashbucket *ilb; 652 661 653 - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 654 662 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 655 663 /* Don't disable bottom halves while acquiring the lock to 656 664 * avoid circular locking dependency on PREEMPT_RT. 657 665 */ 658 - spin_lock(&ilb->lock); 659 666 spin_lock(&ilb2->lock); 660 667 if (sk_unhashed(sk)) { 661 668 spin_unlock(&ilb2->lock); 662 - spin_unlock(&ilb->lock); 663 669 return; 664 670 } 665 671 666 672 if (rcu_access_pointer(sk->sk_reuseport_cb)) 667 673 reuseport_stop_listen_sock(sk); 668 674 669 - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); 670 675 __sk_nulls_del_node_init_rcu(sk); 671 676 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 672 677 spin_unlock(&ilb2->lock); 673 - spin_unlock(&ilb->lock); 674 678 } else { 675 679 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 676 680 ··· 832 848 } 833 849 EXPORT_SYMBOL_GPL(inet_hash_connect); 834 850 835 - void inet_hashinfo_init(struct inet_hashinfo *h) 836 - { 837 - int i; 838 - 839 - for (i = 0; i < INET_LHTABLE_SIZE; i++) { 840 - spin_lock_init(&h->listening_hash[i].lock); 841 - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, 842 - i + LISTENING_NULLS_BASE); 843 - } 844 - 845 - h->lhash2 = NULL; 846 - } 847 - EXPORT_SYMBOL_GPL(inet_hashinfo_init); 848 - 849 851 static void init_hashinfo_lhash2(struct inet_hashinfo *h) 850 852 { 851 853 int i; 852 854 853 855 for (i = 0; i <= h->lhash2_mask; i++) { 854 856 spin_lock_init(&h->lhash2[i].lock); 855 - INIT_HLIST_HEAD(&h->lhash2[i].head); 857 + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 858 + i + LISTENING_NULLS_BASE); 856 859 } 857 860 } 858 861
-1
net/ipv4/tcp.c
··· 4595 4595 timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); 4596 4596 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); 4597 4597 4598 - inet_hashinfo_init(&tcp_hashinfo); 4599 4598 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", 4600 4599 thash_entries, 21, /* one slot per 2 MB*/ 4601 4600 0, 64 * 1024);
+9 -12
net/ipv4/tcp_ipv4.c
··· 2283 2283 st->offset = 0; 2284 2284 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2285 2285 struct inet_listen_hashbucket *ilb2; 2286 - struct inet_connection_sock *icsk; 2286 + struct hlist_nulls_node *node; 2287 2287 struct sock *sk; 2288 2288 2289 2289 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2290 - if (hlist_empty(&ilb2->head)) 2290 + if (hlist_nulls_empty(&ilb2->nulls_head)) 2291 2291 continue; 2292 2292 2293 2293 spin_lock(&ilb2->lock); 2294 - inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2295 - sk = (struct sock *)icsk; 2294 + sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2296 2295 if (seq_sk_match(seq, sk)) 2297 2296 return sk; 2298 2297 } ··· 2310 2311 { 2311 2312 struct tcp_iter_state *st = seq->private; 2312 2313 struct inet_listen_hashbucket *ilb2; 2313 - struct inet_connection_sock *icsk; 2314 + struct hlist_nulls_node *node; 2314 2315 struct sock *sk = cur; 2315 2316 2316 2317 ++st->num; 2317 2318 ++st->offset; 2318 2319 2319 - icsk = inet_csk(sk); 2320 - inet_lhash2_for_each_icsk_continue(icsk) { 2321 - sk = (struct sock *)icsk; 2320 + sk = sk_nulls_next(sk); 2321 + sk_nulls_for_each_from(sk, node) { 2322 2322 if (seq_sk_match(seq, sk)) 2323 2323 return sk; 2324 2324 } ··· 2726 2728 { 2727 2729 struct bpf_tcp_iter_state *iter = seq->private; 2728 2730 struct tcp_iter_state *st = &iter->state; 2729 - struct inet_connection_sock *icsk; 2731 + struct hlist_nulls_node *node; 2730 2732 unsigned int expected = 1; 2731 2733 struct sock *sk; 2732 2734 2733 2735 sock_hold(start_sk); 2734 2736 iter->batch[iter->end_sk++] = start_sk; 2735 2737 2736 - icsk = inet_csk(start_sk); 2737 - inet_lhash2_for_each_icsk_continue(icsk) { 2738 - sk = (struct sock *)icsk; 2738 + sk = sk_nulls_next(start_sk); 2739 + sk_nulls_for_each_from(sk, node) { 2739 2740 if (seq_sk_match(seq, sk)) { 2740 2741 if (iter->end_sk < iter->max_sk) { 2741 2742 sock_hold(sk);
+2 -3
net/ipv6/inet6_hashtables.c
··· 138 138 const __be16 sport, const struct in6_addr *daddr, 139 139 const unsigned short hnum, const int dif, const int sdif) 140 140 { 141 - struct inet_connection_sock *icsk; 142 141 struct sock *sk, *result = NULL; 142 + struct hlist_nulls_node *node; 143 143 int score, hiscore = 0; 144 144 145 - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { 146 - sk = (struct sock *)icsk; 145 + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 147 146 score = compute_score(sk, net, hnum, daddr, dif, sdif); 148 147 if (score > hiscore) { 149 148 result = lookup_reuseport(net, sk, skb, doff,
+2 -2
net/mptcp/mptcp_diag.c
··· 83 83 struct net *net = sock_net(skb->sk); 84 84 int i; 85 85 86 - for (i = diag_ctx->l_slot; i < INET_LHTABLE_SIZE; i++) { 86 + for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { 87 87 struct inet_listen_hashbucket *ilb; 88 88 struct hlist_nulls_node *node; 89 89 struct sock *sk; 90 90 int num = 0; 91 91 92 - ilb = &tcp_hashinfo.listening_hash[i]; 92 + ilb = &tcp_hashinfo.lhash2[i]; 93 93 94 94 rcu_read_lock(); 95 95 spin_lock(&ilb->lock);