Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: Add reuseport_migrate_sock() to select a new listener.

reuseport_migrate_sock() performs the same check as
reuseport_listen_stop_sock(). If the reuseport group is capable of
migration, reuseport_migrate_sock() selects a new listener by the child
socket hash and increments the listener's sk_refcnt beforehand. Thus, if
the migration fails, we have to decrement it later.

We will support migration by eBPF in later commits.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-5-kuniyu@amazon.co.jp

Authored by Kuniyuki Iwashima and committed by Daniel Borkmann
1cd62c21 333bb73f

+67 -14
+3
include/net/sock_reuseport.h
··· 37 37 u32 hash, 38 38 struct sk_buff *skb, 39 39 int hdr_len); 40 + struct sock *reuseport_migrate_sock(struct sock *sk, 41 + struct sock *migrating_sk, 42 + struct sk_buff *skb); 40 43 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); 41 44 extern int reuseport_detach_prog(struct sock *sk); 42 45
+64 -14
net/core/sock_reuseport.c
··· 44 44 struct sock_reuseport *reuse) 45 45 { 46 46 reuse->socks[reuse->num_socks] = sk; 47 - /* paired with smp_rmb() in reuseport_select_sock() */ 47 + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ 48 48 smp_wmb(); 49 49 reuse->num_socks++; 50 50 } ··· 434 434 return reuse->socks[index]; 435 435 } 436 436 437 + static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, 438 + u32 hash, u16 num_socks) 439 + { 440 + int i, j; 441 + 442 + i = j = reciprocal_scale(hash, num_socks); 443 + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { 444 + i++; 445 + if (i >= num_socks) 446 + i = 0; 447 + if (i == j) 448 + return NULL; 449 + } 450 + 451 + return reuse->socks[i]; 452 + } 453 + 437 454 /** 438 455 * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. 439 456 * @sk: First socket in the group. ··· 494 477 495 478 select_by_hash: 496 479 /* no bpf or invalid bpf result: fall back to hash usage */ 497 - if (!sk2) { 498 - int i, j; 499 - 500 - i = j = reciprocal_scale(hash, socks); 501 - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { 502 - i++; 503 - if (i >= socks) 504 - i = 0; 505 - if (i == j) 506 - goto out; 507 - } 508 - sk2 = reuse->socks[i]; 509 - } 480 + if (!sk2) 481 + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); 510 482 } 511 483 512 484 out: ··· 503 497 return sk2; 504 498 } 505 499 EXPORT_SYMBOL(reuseport_select_sock); 500 + 501 + /** 502 + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. 503 + * @sk: close()ed or shutdown()ed socket in the group. 504 + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or 505 + * NEW_SYN_RECV request socket during 3WHS. 506 + * @skb: skb to run through BPF filter. 507 + * Returns a socket (with sk_refcnt +1) that should accept the child socket 508 + * (or NULL on error). 
509 + */ 510 + struct sock *reuseport_migrate_sock(struct sock *sk, 511 + struct sock *migrating_sk, 512 + struct sk_buff *skb) 513 + { 514 + struct sock_reuseport *reuse; 515 + struct sock *nsk = NULL; 516 + u16 socks; 517 + u32 hash; 518 + 519 + rcu_read_lock(); 520 + 521 + reuse = rcu_dereference(sk->sk_reuseport_cb); 522 + if (!reuse) 523 + goto out; 524 + 525 + socks = READ_ONCE(reuse->num_socks); 526 + if (unlikely(!socks)) 527 + goto out; 528 + 529 + /* paired with smp_wmb() in __reuseport_add_sock() */ 530 + smp_rmb(); 531 + 532 + hash = migrating_sk->sk_hash; 533 + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) 534 + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); 535 + 536 + if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) 537 + nsk = NULL; 538 + 539 + out: 540 + rcu_read_unlock(); 541 + return nsk; 542 + } 543 + EXPORT_SYMBOL(reuseport_migrate_sock); 506 544 507 545 int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) 508 546 {