Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2020-12-28

The following pull-request contains BPF updates for your *net* tree.

There is a small merge conflict between bpf tree commit 69ca310f3416
("bpf: Save correct stopping point in file seq iteration") and net tree
commit 66ed594409a1 ("bpf/task_iter: In task_file_seq_get_next use
task_lookup_next_fd_rcu"). The get_files_struct() does not exist anymore
in net, so take the hunk in HEAD and add the `info->tid = curr_tid` to
the error path:

[...]
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task) {
info->task = NULL;
info->tid = curr_tid;
return NULL;
}

/* set info->task and info->tid */
[...]

We've added 10 non-merge commits during the last 9 day(s) which contain
a total of 11 files changed, 75 insertions(+), 20 deletions(-).

The main changes are:

1) Various AF_XDP fixes such as fill/completion ring leak on failed bind and
fixing a race in skb mode's backpressure mechanism, from Magnus Karlsson.

2) Fix latency spikes on lockdep enabled kernels by adding a rescheduling
point to BPF hashtab initialization, from Eric Dumazet.

3) Fix a splat in task iterator by saving the correct stopping point in the
seq file iteration, from Jonathan Lemon.

4) Fix BPF maps selftest by adding retries in case hashtab returns EBUSY
errors on updates/deletes, from Andrii Nakryiko.

5) Fix BPF selftest error reporting to something more user-friendly if the
vmlinux BTF cannot be found, from Kamal Mostafa.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+80 -26
-4
include/net/xdp_sock.h
··· 58 58 59 59 struct xsk_queue *tx ____cacheline_aligned_in_smp; 60 60 struct list_head tx_list; 61 - /* Mutual exclusion of NAPI TX thread and sendmsg error paths 62 - * in the SKB destructor callback. 63 - */ 64 - spinlock_t tx_completion_lock; 65 61 /* Protects generic receive. */ 66 62 spinlock_t rx_lock; 67 63
+5
include/net/xsk_buff_pool.h
··· 73 73 bool dma_need_sync; 74 74 bool unaligned; 75 75 void *addrs; 76 + /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: 77 + * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when 78 + * sockets share a single cq when the same netdev and queue id is shared. 79 + */ 80 + spinlock_t cq_lock; 76 81 struct xdp_buff_xsk *free_heads[]; 77 82 }; 78 83
+1
kernel/bpf/hashtab.c
··· 152 152 lockdep_set_class(&htab->buckets[i].lock, 153 153 &htab->lockdep_key); 154 154 } 155 + cond_resched(); 155 156 } 156 157 } 157 158
-1
kernel/bpf/syscall.c
··· 17 17 #include <linux/fs.h> 18 18 #include <linux/license.h> 19 19 #include <linux/filter.h> 20 - #include <linux/version.h> 21 20 #include <linux/kernel.h> 22 21 #include <linux/idr.h> 23 22 #include <linux/cred.h>
+8 -8
kernel/bpf/task_iter.c
··· 37 37 if (!task) { 38 38 ++*tid; 39 39 goto retry; 40 - } else if (skip_if_dup_files && task->tgid != task->pid && 40 + } else if (skip_if_dup_files && !thread_group_leader(task) && 41 41 task->files == task->group_leader->files) { 42 42 put_task_struct(task); 43 43 task = NULL; ··· 151 151 curr_task = info->task; 152 152 curr_fd = info->fd; 153 153 } else { 154 - curr_task = task_seq_get_next(ns, &curr_tid, true); 155 - if (!curr_task) { 156 - info->task = NULL; 157 - return NULL; 158 - } 154 + curr_task = task_seq_get_next(ns, &curr_tid, true); 155 + if (!curr_task) { 156 + info->task = NULL; 157 + info->tid = curr_tid; 158 + return NULL; 159 + } 159 160 160 - /* set info->task and info->tid */ 161 - info->task = curr_task; 161 + /* set info->task and info->tid */ 162 162 if (curr_tid == info->tid) { 163 163 curr_fd = info->fd; 164 164 } else {
+13 -3
net/xdp/xsk.c
··· 423 423 struct xdp_sock *xs = xdp_sk(skb->sk); 424 424 unsigned long flags; 425 425 426 - spin_lock_irqsave(&xs->tx_completion_lock, flags); 426 + spin_lock_irqsave(&xs->pool->cq_lock, flags); 427 427 xskq_prod_submit_addr(xs->pool->cq, addr); 428 - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); 428 + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 429 429 430 430 sock_wfree(skb); 431 431 } ··· 437 437 bool sent_frame = false; 438 438 struct xdp_desc desc; 439 439 struct sk_buff *skb; 440 + unsigned long flags; 440 441 int err = 0; 441 442 442 443 mutex_lock(&xs->mutex); ··· 469 468 * if there is space in it. This avoids having to implement 470 469 * any buffering in the Tx path. 471 470 */ 471 + spin_lock_irqsave(&xs->pool->cq_lock, flags); 472 472 if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { 473 + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 473 474 kfree_skb(skb); 474 475 goto out; 475 476 } 477 + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 476 478 477 479 skb->dev = xs->dev; 478 480 skb->priority = sk->sk_priority; ··· 487 483 if (err == NETDEV_TX_BUSY) { 488 484 /* Tell user-space to retry the send */ 489 485 skb->destructor = sock_wfree; 486 + spin_lock_irqsave(&xs->pool->cq_lock, flags); 487 + xskq_prod_cancel(xs->pool->cq); 488 + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 490 489 /* Free skb without triggering the perf drop trace */ 491 490 consume_skb(skb); 492 491 err = -EAGAIN; ··· 884 877 goto out_unlock; 885 878 } 886 879 } 880 + 881 + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ 882 + xs->fq_tmp = NULL; 883 + xs->cq_tmp = NULL; 887 884 888 885 xs->dev = dev; 889 886 xs->zc = xs->umem->zc; ··· 1310 1299 xs->state = XSK_READY; 1311 1300 mutex_init(&xs->mutex); 1312 1301 spin_lock_init(&xs->rx_lock); 1313 - spin_lock_init(&xs->tx_completion_lock); 1314 1302 1315 1303 INIT_LIST_HEAD(&xs->map_list); 1316 1304 spin_lock_init(&xs->map_list_lock);
+1 -2
net/xdp/xsk_buff_pool.c
··· 71 71 INIT_LIST_HEAD(&pool->free_list); 72 72 INIT_LIST_HEAD(&pool->xsk_tx_list); 73 73 spin_lock_init(&pool->xsk_tx_list_lock); 74 + spin_lock_init(&pool->cq_lock); 74 75 refcount_set(&pool->users, 1); 75 76 76 77 pool->fq = xs->fq_tmp; 77 78 pool->cq = xs->cq_tmp; 78 - xs->fq_tmp = NULL; 79 - xs->cq_tmp = NULL; 80 79 81 80 for (i = 0; i < pool->free_heads_cnt; i++) { 82 81 xskb = &pool->heads[i];
+5
net/xdp/xsk_queue.h
··· 334 334 return xskq_prod_nb_free(q, 1) ? false : true; 335 335 } 336 336 337 + static inline void xskq_prod_cancel(struct xsk_queue *q) 338 + { 339 + q->cached_prod--; 340 + } 341 + 337 342 static inline int xskq_prod_reserve(struct xsk_queue *q) 338 343 { 339 344 if (xskq_prod_is_full(q))
+3
tools/testing/selftests/bpf/Makefile
··· 121 121 /sys/kernel/btf/vmlinux \ 122 122 /boot/vmlinux-$(shell uname -r) 123 123 VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) 124 + ifeq ($(VMLINUX_BTF),) 125 + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") 126 + endif 124 127 125 128 # Define simple and short `make test_progs`, `make test_sysctl`, etc targets 126 129 # to build individual tests.
+42 -6
tools/testing/selftests/bpf/test_maps.c
··· 1312 1312 #define DO_UPDATE 1 1313 1313 #define DO_DELETE 0 1314 1314 1315 + #define MAP_RETRIES 20 1316 + 1317 + static int map_update_retriable(int map_fd, const void *key, const void *value, 1318 + int flags, int attempts) 1319 + { 1320 + while (bpf_map_update_elem(map_fd, key, value, flags)) { 1321 + if (!attempts || (errno != EAGAIN && errno != EBUSY)) 1322 + return -errno; 1323 + 1324 + usleep(1); 1325 + attempts--; 1326 + } 1327 + 1328 + return 0; 1329 + } 1330 + 1331 + static int map_delete_retriable(int map_fd, const void *key, int attempts) 1332 + { 1333 + while (bpf_map_delete_elem(map_fd, key)) { 1334 + if (!attempts || (errno != EAGAIN && errno != EBUSY)) 1335 + return -errno; 1336 + 1337 + usleep(1); 1338 + attempts--; 1339 + } 1340 + 1341 + return 0; 1342 + } 1343 + 1315 1344 static void test_update_delete(unsigned int fn, void *data) 1316 1345 { 1317 1346 int do_update = ((int *)data)[1]; 1318 1347 int fd = ((int *)data)[0]; 1319 - int i, key, value; 1348 + int i, key, value, err; 1320 1349 1321 1350 for (i = fn; i < MAP_SIZE; i += TASKS) { 1322 1351 key = value = i; 1323 1352 1324 1353 if (do_update) { 1325 - assert(bpf_map_update_elem(fd, &key, &value, 1326 - BPF_NOEXIST) == 0); 1327 - assert(bpf_map_update_elem(fd, &key, &value, 1328 - BPF_EXIST) == 0); 1354 + err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES); 1355 + if (err) 1356 + printf("error %d %d\n", err, errno); 1357 + assert(err == 0); 1358 + err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES); 1359 + if (err) 1360 + printf("error %d %d\n", err, errno); 1361 + assert(err == 0); 1329 1362 } else { 1330 - assert(bpf_map_delete_elem(fd, &key) == 0); 1363 + err = map_delete_retriable(fd, &key, MAP_RETRIES); 1364 + if (err) 1365 + printf("error %d %d\n", err, errno); 1366 + assert(err == 0); 1331 1367 } 1332 1368 } 1333 1369 }
+2 -2
tools/testing/selftests/bpf/xdpxceiver.c
··· 715 715 int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); 716 716 717 717 if (payload == EOT) { 718 - ksft_print_msg("End-of-tranmission frame received\n"); 718 + ksft_print_msg("End-of-transmission frame received\n"); 719 719 fprintf(stdout, "---------------------------------------\n"); 720 720 break; 721 721 } ··· 747 747 } 748 748 749 749 if (payloadseqnum == EOT) { 750 - ksft_print_msg("End-of-tranmission frame received: PASS\n"); 750 + ksft_print_msg("End-of-transmission frame received: PASS\n"); 751 751 sigvar = 1; 752 752 break; 753 753 }