Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: udp: Avoid socket skips and repeats during iteration

Replace the offset-based approach for tracking progress through a bucket
in the UDP table with one based on socket cookies. Remember the cookies
of unprocessed sockets from the last batch and use this list to
pick up where we left off or, in the case that the next socket
disappears between reads, find the first socket after that point that
still exists in the bucket and resume from there.

This approach guarantees that all sockets that existed when iteration
began and continue to exist throughout will be visited exactly once.
Sockets that are added to the table during iteration may or may not be
seen, but if they are they will be seen exactly once.

Signed-off-by: Jordan Rife <jordan@jrife.io>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>

Authored by Jordan Rife; committed by Martin KaFai Lau (commit 5668f73f, parent 251c6636).

+44 -17
net/ipv4/udp.c
Diff reconstructed from the rendered (line-number-fused) view; hunk headers are
approximate where the original gutter numbers were garbled:

@@ -93,6 +93,7 @@
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/slab.h>
+#include <linux/sock_diag.h>
 #include <net/tcp_states.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>

@@ -3416,? +3415,? @@
 
 union bpf_udp_iter_batch_item {
 	struct sock *sk;
+	__u64 cookie;
 };
 
 struct bpf_udp_iter_state {
 	unsigned int cur_sk;
 	unsigned int end_sk;
 	unsigned int max_sk;
-	int offset;
 	union bpf_udp_iter_batch_item *batch;
 };
 
 static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
 				      unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+					union bpf_udp_iter_batch_item *cookies,
+					int n_cookies)
+{
+	struct sock *sk = NULL;
+	int i;
+
+	for (i = 0; i < n_cookies; i++) {
+		sk = first_sk;
+		udp_portaddr_for_each_entry_from(sk)
+			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+				goto done;
+	}
+done:
+	return sk;
+}
+
 static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 {
 	struct bpf_udp_iter_state *iter = seq->private;
 	struct udp_iter_state *state = &iter->state;
+	unsigned int find_cookie, end_cookie;
 	struct net *net = seq_file_net(seq);
-	int resume_bucket, resume_offset;
 	struct udp_table *udptable;
 	unsigned int batch_sks = 0;
+	int resume_bucket;
 	int resizes = 0;
 	struct sock *sk;
 	int err = 0;
 
 	resume_bucket = state->bucket;
-	resume_offset = iter->offset;
 
 	/* The current batch is done, so advance the bucket. */
 	if (iter->cur_sk == iter->end_sk)

@@ -3475,? +3457,? @@
 	 * before releasing the bucket lock. This allows BPF programs that are
 	 * called in seq_show to acquire the bucket lock if needed.
 	 */
+	find_cookie = iter->cur_sk;
+	end_cookie = iter->end_sk;
 	iter->cur_sk = 0;
 	iter->end_sk = 0;
 	batch_sks = 0;

@@ -3487,? +3467,? @@
 		if (hlist_empty(&hslot2->head))
 			goto next_bucket;
 
-		iter->offset = 0;
 		spin_lock_bh(&hslot2->lock);
 		sk = hlist_entry_safe(hslot2->head.first, struct sock,
 				      __sk_common.skc_portaddr_node);
+		/* Resume from the first (in iteration order) unseen socket from
+		 * the last batch that still exists in resume_bucket. Most of
+		 * the time this will just be where the last iteration left off
+		 * in resume_bucket unless that socket disappeared between
+		 * reads.
+		 */
+		if (state->bucket == resume_bucket)
+			sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+						 end_cookie - find_cookie);
 fill_batch:
 		udp_portaddr_for_each_entry_from(sk) {
 			if (seq_sk_match(seq, sk)) {
-				/* Resume from the last iterated socket at the
-				 * offset in the bucket before iterator was stopped.
-				 */
-				if (state->bucket == resume_bucket &&
-				    iter->offset < resume_offset) {
-					++iter->offset;
-					continue;
-				}
 				if (iter->end_sk < iter->max_sk) {
 					sock_hold(sk);
 					iter->batch[iter->end_sk++].sk = sk;

@@ -3568,? +3548,? @@
 	/* Whenever seq_next() is called, the iter->cur_sk is
 	 * done with seq_show(), so unref the iter->cur_sk.
 	 */
-	if (iter->cur_sk < iter->end_sk) {
+	if (iter->cur_sk < iter->end_sk)
 		sock_put(iter->batch[iter->cur_sk++].sk);
-		++iter->offset;
-	}
 
 	/* After updating iter->cur_sk, check if there are more sockets
 	 * available in the current bucket batch.

@@ -3639,? +3621,? @@
 
 static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
 {
+	union bpf_udp_iter_batch_item *item;
 	unsigned int cur_sk = iter->cur_sk;
+	__u64 cookie;
 
-	while (cur_sk < iter->end_sk)
-		sock_put(iter->batch[cur_sk++].sk);
+	/* Remember the cookies of the sockets we haven't seen yet, so we can
+	 * pick up where we left off next time around.
+	 */
+	while (cur_sk < iter->end_sk) {
+		item = &iter->batch[cur_sk++];
+		cookie = sock_gen_cookie(item->sk);
+		sock_put(item->sk);
+		item->cookie = cookie;
+	}
 }
 
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)