Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: seq_file: Replace listening_hash with lhash2

This patch moves the tcp seq_file iteration on listeners
from the port-only listening_hash to the port+addr lhash2.

When iterating from the bpf iter, the next patch will need to
lock the socket such that the bpf iter can call setsockopt (e.g. to
change the TCP_CONGESTION). To avoid locking the bucket and then locking
the sock, the bpf iter will first batch some sockets from the same bucket
and then unlock the bucket. If the bucket size is small (as it
usually is), it is easier to batch the whole bucket such that it is less
likely to miss a setsockopt on a socket due to changes in the bucket.

However, the port-only listening_hash could have many listeners
hashed to a single bucket (e.g. many individual VIP(s):443, further
multiplied by the number of SO_REUSEPORT sockets). We have seen bucket
sizes in the tens-of-thousands range. The chance of changes happening
in some popular port buckets (e.g. 443) is also high.

The port+addr lhash2 was introduced to solve this large listener bucket
issue. Also, the listening_hash usage has already been replaced with
lhash2 in the fast path inet[6]_lookup_listener(). This patch follows
the same direction on moving to lhash2 and iterates the lhash2
instead of listening_hash.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210701200606.1035783-1-kafai@fb.com

authored by

Martin KaFai Lau and committed by
Andrii Nakryiko
05c0b357 b72acf45

+24 -17
+6
include/net/inet_hashtables.h
··· 160 160 ____cacheline_aligned_in_smp; 161 161 }; 162 162 163 + #define inet_lhash2_for_each_icsk_continue(__icsk) \ 164 + hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) 165 + 166 + #define inet_lhash2_for_each_icsk(__icsk, list) \ 167 + hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) 168 + 163 169 #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ 164 170 hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) 165 171
+18 -17
net/ipv4/tcp_ipv4.c
··· 2296 2296 struct tcp_iter_state *st = seq->private; 2297 2297 2298 2298 st->offset = 0; 2299 - for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) { 2300 - struct inet_listen_hashbucket *ilb; 2301 - struct hlist_nulls_node *node; 2299 + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2300 + struct inet_listen_hashbucket *ilb2; 2301 + struct inet_connection_sock *icsk; 2302 2302 struct sock *sk; 2303 2303 2304 - ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2305 - if (hlist_nulls_empty(&ilb->nulls_head)) 2304 + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2305 + if (hlist_empty(&ilb2->head)) 2306 2306 continue; 2307 2307 2308 - spin_lock(&ilb->lock); 2309 - sk_nulls_for_each(sk, node, &ilb->nulls_head) { 2308 + spin_lock(&ilb2->lock); 2309 + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2310 + sk = (struct sock *)icsk; 2310 2311 if (seq_sk_match(seq, sk)) 2311 2312 return sk; 2312 2313 } 2313 - spin_unlock(&ilb->lock); 2314 + spin_unlock(&ilb2->lock); 2314 2315 } 2315 2316 2316 2317 return NULL; ··· 2325 2324 static void *listening_get_next(struct seq_file *seq, void *cur) 2326 2325 { 2327 2326 struct tcp_iter_state *st = seq->private; 2328 - struct inet_listen_hashbucket *ilb; 2329 - struct hlist_nulls_node *node; 2327 + struct inet_listen_hashbucket *ilb2; 2328 + struct inet_connection_sock *icsk; 2330 2329 struct sock *sk = cur; 2331 2330 2332 2331 ++st->num; 2333 2332 ++st->offset; 2334 2333 2335 - sk = sk_nulls_next(sk); 2336 - 2337 - sk_nulls_for_each_from(sk, node) { 2334 + icsk = inet_csk(sk); 2335 + inet_lhash2_for_each_icsk_continue(icsk) { 2336 + sk = (struct sock *)icsk; 2338 2337 if (seq_sk_match(seq, sk)) 2339 2338 return sk; 2340 2339 } 2341 2340 2342 - ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2343 - spin_unlock(&ilb->lock); 2341 + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2342 + spin_unlock(&ilb2->lock); 2344 2343 ++st->bucket; 2345 2344 return listening_get_first(seq); 2346 2345 } ··· 2457 2456 2458 2457 switch (st->state) { 
2459 2458 case TCP_SEQ_STATE_LISTENING: 2460 - if (st->bucket >= INET_LHTABLE_SIZE) 2459 + if (st->bucket > tcp_hashinfo.lhash2_mask) 2461 2460 break; 2462 2461 st->state = TCP_SEQ_STATE_LISTENING; 2463 2462 rc = listening_get_first(seq); ··· 2542 2541 switch (st->state) { 2543 2542 case TCP_SEQ_STATE_LISTENING: 2544 2543 if (v != SEQ_START_TOKEN) 2545 - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2544 + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2546 2545 break; 2547 2546 case TCP_SEQ_STATE_ESTABLISHED: 2548 2547 if (v)