Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: Add num_closed_socks to struct sock_reuseport.

As noted in the following commit, a closed listener has to hold the
reference to the reuseport group for socket migration. This patch adds a
field (num_closed_socks) to struct sock_reuseport to manage closed sockets
within the same reuseport group. Moreover, this and the following commits
introduce some helper functions to split socks[] into two sections and keep
TCP_LISTEN and TCP_CLOSE sockets in each section. Like a double-ended
queue, we will place TCP_LISTEN sockets from the front and TCP_CLOSE
sockets from the end.

TCP_LISTEN----------> <-------TCP_CLOSE
+---+---+ --- +---+ --- +---+ --- +---+
| 0 | 1 | ... | i | ... | j | ... | k |
+---+---+ --- +---+ --- +---+ --- +---+

i = num_socks - 1
j = max_socks - num_closed_socks
k = max_socks - 1

This patch also extends reuseport_add_sock() and reuseport_grow() to
support num_closed_socks.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-3-kuniyu@amazon.co.jp

Authored by Kuniyuki Iwashima and committed by Daniel Borkmann.
5c040eaf f9ac779f

+60 -20
+3 -2
include/net/sock_reuseport.h
··· 13 13 struct sock_reuseport { 14 14 struct rcu_head rcu; 15 15 16 - u16 max_socks; /* length of socks */ 17 - u16 num_socks; /* elements in socks */ 16 + u16 max_socks; /* length of socks */ 17 + u16 num_socks; /* elements in socks */ 18 + u16 num_closed_socks; /* closed elements in socks */ 18 19 /* The last synq overflow event timestamp of this 19 20 * reuse->socks[] group. 20 21 */
+57 -18
net/core/sock_reuseport.c
··· 18 18 19 19 static DEFINE_IDA(reuseport_ida); 20 20 21 + static int reuseport_sock_index(struct sock *sk, 22 + const struct sock_reuseport *reuse, 23 + bool closed) 24 + { 25 + int left, right; 26 + 27 + if (!closed) { 28 + left = 0; 29 + right = reuse->num_socks; 30 + } else { 31 + left = reuse->max_socks - reuse->num_closed_socks; 32 + right = reuse->max_socks; 33 + } 34 + 35 + for (; left < right; left++) 36 + if (reuse->socks[left] == sk) 37 + return left; 38 + return -1; 39 + } 40 + 41 + static void __reuseport_add_sock(struct sock *sk, 42 + struct sock_reuseport *reuse) 43 + { 44 + reuse->socks[reuse->num_socks] = sk; 45 + /* paired with smp_rmb() in reuseport_select_sock() */ 46 + smp_wmb(); 47 + reuse->num_socks++; 48 + } 49 + 50 + static bool __reuseport_detach_sock(struct sock *sk, 51 + struct sock_reuseport *reuse) 52 + { 53 + int i = reuseport_sock_index(sk, reuse, false); 54 + 55 + if (i == -1) 56 + return false; 57 + 58 + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; 59 + reuse->num_socks--; 60 + 61 + return true; 62 + } 63 + 21 64 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) 22 65 { 23 66 unsigned int size = sizeof(struct sock_reuseport) + ··· 115 72 } 116 73 117 74 reuse->reuseport_id = id; 75 + reuse->bind_inany = bind_inany; 118 76 reuse->socks[0] = sk; 119 77 reuse->num_socks = 1; 120 - reuse->bind_inany = bind_inany; 121 78 rcu_assign_pointer(sk->sk_reuseport_cb, reuse); 122 79 123 80 out: ··· 141 98 return NULL; 142 99 143 100 more_reuse->num_socks = reuse->num_socks; 101 + more_reuse->num_closed_socks = reuse->num_closed_socks; 144 102 more_reuse->prog = reuse->prog; 145 103 more_reuse->reuseport_id = reuse->reuseport_id; 146 104 more_reuse->bind_inany = reuse->bind_inany; ··· 149 105 150 106 memcpy(more_reuse->socks, reuse->socks, 151 107 reuse->num_socks * sizeof(struct sock *)); 108 + memcpy(more_reuse->socks + 109 + (more_reuse->max_socks - more_reuse->num_closed_socks), 110 + reuse->socks + 
(reuse->max_socks - reuse->num_closed_socks), 111 + reuse->num_closed_socks * sizeof(struct sock *)); 152 112 more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); 153 113 154 - for (i = 0; i < reuse->num_socks; ++i) 114 + for (i = 0; i < reuse->max_socks; ++i) 155 115 rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, 156 116 more_reuse); 157 117 ··· 206 158 return -EBUSY; 207 159 } 208 160 209 - if (reuse->num_socks == reuse->max_socks) { 161 + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { 210 162 reuse = reuseport_grow(reuse); 211 163 if (!reuse) { 212 164 spin_unlock_bh(&reuseport_lock); ··· 214 166 } 215 167 } 216 168 217 - reuse->socks[reuse->num_socks] = sk; 218 - /* paired with smp_rmb() in reuseport_select_sock() */ 219 - smp_wmb(); 220 - reuse->num_socks++; 169 + __reuseport_add_sock(sk, reuse); 221 170 rcu_assign_pointer(sk->sk_reuseport_cb, reuse); 222 171 223 172 spin_unlock_bh(&reuseport_lock); ··· 228 183 void reuseport_detach_sock(struct sock *sk) 229 184 { 230 185 struct sock_reuseport *reuse; 231 - int i; 232 186 233 187 spin_lock_bh(&reuseport_lock); 234 188 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, ··· 244 200 bpf_sk_reuseport_detach(sk); 245 201 246 202 rcu_assign_pointer(sk->sk_reuseport_cb, NULL); 203 + __reuseport_detach_sock(sk, reuse); 247 204 248 - for (i = 0; i < reuse->num_socks; i++) { 249 - if (reuse->socks[i] == sk) { 250 - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; 251 - reuse->num_socks--; 252 - if (reuse->num_socks == 0) 253 - call_rcu(&reuse->rcu, reuseport_free_rcu); 254 - break; 255 - } 256 - } 205 + if (reuse->num_socks + reuse->num_closed_socks == 0) 206 + call_rcu(&reuse->rcu, reuseport_free_rcu); 207 + 257 208 spin_unlock_bh(&reuseport_lock); 258 209 } 259 210 EXPORT_SYMBOL(reuseport_detach_sock); ··· 313 274 prog = rcu_dereference(reuse->prog); 314 275 socks = READ_ONCE(reuse->num_socks); 315 276 if (likely(socks)) { 316 - /* paired with smp_wmb() in 
reuseport_add_sock() */ 277 + /* paired with smp_wmb() in __reuseport_add_sock() */ 317 278 smp_rmb(); 318 279 319 280 if (!prog || !skb)