Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2024-09-11

We've added 12 non-merge commits during the last 16 day(s) which contain
a total of 20 files changed, 228 insertions(+), 30 deletions(-).

There's a minor merge conflict in drivers/net/netkit.c:
00d066a4d4ed ("netdev_features: convert NETIF_F_LLTX to dev->lltx")
d96608794889 ("netkit: Disable netpoll support")

The main changes are:

1) Enable bpf_dynptr_from_skb for tp_btf such that this can be used
to easily parse skbs in BPF programs attached to tracepoints,
from Philo Lu.

2) Add a cond_resched() point in BPF's sock_hash_free() as there have
been several syzbot soft lockup reports recently, from Eric Dumazet.

3) Fix xsk_buff_can_alloc() to account for queue_empty_descs which
got noticed when the zero copy ice driver started to use it,
from Maciej Fijalkowski.

4) Move the xdp:xdp_cpumap_kthread tracepoint before cpumap pushes skbs
up via netif_receive_skb_list() to better measure latencies,
from Daniel Xu.

5) Follow-up to disable netpoll support from netkit, from Daniel Borkmann.

6) Improve xsk selftests to not assume a fixed MAX_SKB_FRAGS of 17 but
instead gather the actual value via /proc/sys/net/core/max_skb_frags,
also from Maciej Fijalkowski.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
sock_map: Add a cond_resched() in sock_hash_free()
selftests/bpf: Expand skb dynptr selftests for tp_btf
bpf: Allow bpf_dynptr_from_skb() for tp_btf
tcp: Use skb__nullable in trace_tcp_send_reset
selftests/bpf: Add test for __nullable suffix in tp_btf
bpf: Support __nullable argument suffix for tp_btf
bpf, cpumap: Move xdp:xdp_cpumap_kthread tracepoint before rcv
selftests/xsk: Read current MAX_SKB_FRAGS from sysctl knob
xsk: Bump xsk_queue::queue_empty_descs in xp_can_alloc()
tcp_bpf: Remove an unused parameter for bpf_tcp_ingress()
bpf, sockmap: Correct spelling skmsg.c
netkit: Disable netpoll support

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
====================

Link: https://patch.msgid.link/20240911211525.13834-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+228 -30
+1
drivers/net/netkit.c
··· 255 255 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 256 256 dev->priv_flags |= IFF_PHONY_HEADROOM; 257 257 dev->priv_flags |= IFF_NO_QUEUE; 258 + dev->priv_flags |= IFF_DISABLE_NETPOLL; 258 259 dev->lltx = true; 259 260 260 261 dev->ethtool_ops = &netkit_ethtool_ops;
+6 -6
include/trace/events/tcp.h
··· 91 91 TRACE_EVENT(tcp_send_reset, 92 92 93 93 TP_PROTO(const struct sock *sk, 94 - const struct sk_buff *skb, 94 + const struct sk_buff *skb__nullable, 95 95 const enum sk_rst_reason reason), 96 96 97 - TP_ARGS(sk, skb, reason), 97 + TP_ARGS(sk, skb__nullable, reason), 98 98 99 99 TP_STRUCT__entry( 100 100 __field(const void *, skbaddr) ··· 106 106 ), 107 107 108 108 TP_fast_assign( 109 - __entry->skbaddr = skb; 109 + __entry->skbaddr = skb__nullable; 110 110 __entry->skaddr = sk; 111 111 /* Zero means unknown state. */ 112 112 __entry->state = sk ? sk->sk_state : 0; ··· 118 118 const struct inet_sock *inet = inet_sk(sk); 119 119 120 120 TP_STORE_ADDR_PORTS(__entry, inet, sk); 121 - } else if (skb) { 122 - const struct tcphdr *th = (const struct tcphdr *)skb->data; 121 + } else if (skb__nullable) { 122 + const struct tcphdr *th = (const struct tcphdr *)skb__nullable->data; 123 123 /* 124 124 * We should reverse the 4-tuple of skb, so later 125 125 * it can print the right flow direction of rst. 126 126 */ 127 - TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, entry->saddr); 127 + TP_STORE_ADDR_PORTS_SKB(skb__nullable, th, entry->daddr, entry->saddr); 128 128 } 129 129 __entry->reason = reason; 130 130 ),
+3
kernel/bpf/btf.c
··· 6525 6525 if (prog_args_trusted(prog)) 6526 6526 info->reg_type |= PTR_TRUSTED; 6527 6527 6528 + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) 6529 + info->reg_type |= PTR_MAYBE_NULL; 6530 + 6528 6531 if (tgt_prog) { 6529 6532 enum bpf_prog_type tgt_type; 6530 6533
+4 -2
kernel/bpf/cpumap.c
··· 354 354 355 355 list_add_tail(&skb->list, &list); 356 356 } 357 - netif_receive_skb_list(&list); 358 357 359 - /* Feedback loop via tracepoint */ 358 + /* Feedback loop via tracepoint. 359 + * NB: keep before recv to allow measuring enqueue/dequeue latency. 360 + */ 360 361 trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, 361 362 sched, &stats); 362 363 364 + netif_receive_skb_list(&list); 363 365 local_bh_enable(); /* resched point, may call do_softirq() */ 364 366 } 365 367 __set_current_state(TASK_RUNNING);
+32 -4
kernel/bpf/verifier.c
··· 28 28 #include <linux/cpumask.h> 29 29 #include <linux/bpf_mem_alloc.h> 30 30 #include <net/xdp.h> 31 + #include <linux/trace_events.h> 32 + #include <linux/kallsyms.h> 31 33 32 34 #include "disasm.h" 33 35 ··· 21156 21154 { 21157 21155 bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; 21158 21156 bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; 21157 + char trace_symbol[KSYM_SYMBOL_LEN]; 21159 21158 const char prefix[] = "btf_trace_"; 21159 + struct bpf_raw_event_map *btp; 21160 21160 int ret = 0, subprog = -1, i; 21161 21161 const struct btf_type *t; 21162 21162 bool conservative = true; 21163 - const char *tname; 21163 + const char *tname, *fname; 21164 21164 struct btf *btf; 21165 21165 long addr = 0; 21166 21166 struct module *mod = NULL; ··· 21293 21289 return -EINVAL; 21294 21290 } 21295 21291 tname += sizeof(prefix) - 1; 21296 - t = btf_type_by_id(btf, t->type); 21297 - if (!btf_type_is_ptr(t)) 21298 - /* should never happen in valid vmlinux build */ 21292 + 21293 + /* The func_proto of "btf_trace_##tname" is generated from typedef without argument 21294 + * names. Thus using bpf_raw_event_map to get argument names. 
21295 + */ 21296 + btp = bpf_get_raw_tracepoint(tname); 21297 + if (!btp) 21299 21298 return -EINVAL; 21299 + fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, 21300 + trace_symbol); 21301 + bpf_put_raw_tracepoint(btp); 21302 + 21303 + if (fname) 21304 + ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC); 21305 + 21306 + if (!fname || ret < 0) { 21307 + bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n", 21308 + prefix, tname); 21309 + t = btf_type_by_id(btf, t->type); 21310 + if (!btf_type_is_ptr(t)) 21311 + /* should never happen in valid vmlinux build */ 21312 + return -EINVAL; 21313 + } else { 21314 + t = btf_type_by_id(btf, ret); 21315 + if (!btf_type_is_func(t)) 21316 + /* should never happen in valid vmlinux build */ 21317 + return -EINVAL; 21318 + } 21319 + 21300 21320 t = btf_type_by_id(btf, t->type); 21301 21321 if (!btf_type_is_func_proto(t)) 21302 21322 /* should never happen in valid vmlinux build */
+2 -1
net/core/filter.c
··· 12063 12063 } 12064 12064 12065 12065 BTF_KFUNCS_START(bpf_kfunc_check_set_skb) 12066 - BTF_ID_FLAGS(func, bpf_dynptr_from_skb) 12066 + BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) 12067 12067 BTF_KFUNCS_END(bpf_kfunc_check_set_skb) 12068 12068 12069 12069 BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) ··· 12112 12112 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); 12113 12113 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); 12114 12114 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); 12115 + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); 12115 12116 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); 12116 12117 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 12117 12118 &bpf_kfunc_set_sock_addr);
+1 -1
net/core/skmsg.c
··· 293 293 /* If we trim data a full sg elem before curr pointer update 294 294 * copybreak and current so that any future copy operations 295 295 * start at new copy location. 296 - * However trimed data that has not yet been used in a copy op 296 + * However trimmed data that has not yet been used in a copy op 297 297 * does not require an update. 298 298 */ 299 299 if (!msg->sg.size) {
+1
net/core/sock_map.c
··· 1183 1183 sock_put(elem->sk); 1184 1184 sock_hash_free_elem(htab, elem); 1185 1185 } 1186 + cond_resched(); 1186 1187 } 1187 1188 1188 1189 /* wait for psock readers accessing its map link */
+2 -2
net/ipv4/tcp_bpf.c
··· 30 30 } 31 31 32 32 static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, 33 - struct sk_msg *msg, u32 apply_bytes, int flags) 33 + struct sk_msg *msg, u32 apply_bytes) 34 34 { 35 35 bool apply = apply_bytes; 36 36 struct scatterlist *sge; ··· 167 167 if (unlikely(!psock)) 168 168 return -EPIPE; 169 169 170 - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : 170 + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : 171 171 tcp_bpf_push_locked(sk, msg, bytes, flags, false); 172 172 sk_psock_put(sk, psock); 173 173 return ret;
+9 -1
net/xdp/xsk_buff_pool.c
··· 661 661 662 662 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) 663 663 { 664 + u32 req_count, avail_count; 665 + 664 666 if (pool->free_list_cnt >= count) 665 667 return true; 666 - return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); 668 + 669 + req_count = count - pool->free_list_cnt; 670 + avail_count = xskq_cons_nb_entries(pool->fq, req_count); 671 + if (!avail_count) 672 + pool->fq->queue_empty_descs++; 673 + 674 + return avail_count >= req_count; 667 675 } 668 676 EXPORT_SYMBOL(xp_can_alloc); 669 677
-5
net/xdp/xsk_queue.h
··· 306 306 return entries >= max ? max : entries; 307 307 } 308 308 309 - static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) 310 - { 311 - return xskq_cons_nb_entries(q, cnt) >= cnt; 312 - } 313 - 314 309 static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) 315 310 { 316 311 if (q->cached_prod == q->cached_cons)
+6
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h
··· 34 34 TP_ARGS(task, ctx) 35 35 ); 36 36 37 + /* Used in bpf_testmod_test_read() to test __nullable suffix */ 38 + DECLARE_TRACE(bpf_testmod_test_nullable_bare, 39 + TP_PROTO(struct bpf_testmod_test_read_ctx *ctx__nullable), 40 + TP_ARGS(ctx__nullable) 41 + ); 42 + 37 43 #undef BPF_TESTMOD_DECLARE_TRACE 38 44 #ifdef DECLARE_TRACE_WRITABLE 39 45 #define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \
+2
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
··· 356 356 if (bpf_testmod_loop_test(101) > 100) 357 357 trace_bpf_testmod_test_read(current, &ctx); 358 358 359 + trace_bpf_testmod_test_nullable_bare(NULL); 360 + 359 361 /* Magic number to enable writable tp */ 360 362 if (len == 64) { 361 363 struct bpf_testmod_test_writable_ctx writable = {
+35 -2
tools/testing/selftests/bpf/prog_tests/dynptr.c
··· 9 9 enum test_setup_type { 10 10 SETUP_SYSCALL_SLEEP, 11 11 SETUP_SKB_PROG, 12 + SETUP_SKB_PROG_TP, 12 13 }; 13 14 14 15 static struct { ··· 29 28 {"test_dynptr_clone", SETUP_SKB_PROG}, 30 29 {"test_dynptr_skb_no_buff", SETUP_SKB_PROG}, 31 30 {"test_dynptr_skb_strcmp", SETUP_SKB_PROG}, 31 + {"test_dynptr_skb_tp_btf", SETUP_SKB_PROG_TP}, 32 32 }; 33 33 34 34 static void verify_success(const char *prog_name, enum test_setup_type setup_type) ··· 37 35 struct dynptr_success *skel; 38 36 struct bpf_program *prog; 39 37 struct bpf_link *link; 40 - int err; 38 + int err; 41 39 42 40 skel = dynptr_success__open(); 43 41 if (!ASSERT_OK_PTR(skel, "dynptr_success__open")) ··· 49 47 if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) 50 48 goto cleanup; 51 49 52 - bpf_program__set_autoload(prog, true); 50 + bpf_program__set_autoload(prog, true); 53 51 54 52 err = dynptr_success__load(skel); 55 53 if (!ASSERT_OK(err, "dynptr_success__load")) ··· 83 81 goto cleanup; 84 82 85 83 err = bpf_prog_test_run_opts(prog_fd, &topts); 84 + 85 + if (!ASSERT_OK(err, "test_run")) 86 + goto cleanup; 87 + 88 + break; 89 + } 90 + case SETUP_SKB_PROG_TP: 91 + { 92 + struct __sk_buff skb = {}; 93 + struct bpf_object *obj; 94 + int aux_prog_fd; 95 + 96 + /* Just use its test_run to trigger kfree_skb tracepoint */ 97 + err = bpf_prog_test_load("./test_pkt_access.bpf.o", BPF_PROG_TYPE_SCHED_CLS, 98 + &obj, &aux_prog_fd); 99 + if (!ASSERT_OK(err, "prog_load sched cls")) 100 + goto cleanup; 101 + 102 + LIBBPF_OPTS(bpf_test_run_opts, topts, 103 + .data_in = &pkt_v4, 104 + .data_size_in = sizeof(pkt_v4), 105 + .ctx_in = &skb, 106 + .ctx_size_in = sizeof(skb), 107 + ); 108 + 109 + link = bpf_program__attach(prog); 110 + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) 111 + goto cleanup; 112 + 113 + err = bpf_prog_test_run_opts(aux_prog_fd, &topts); 114 + bpf_link__destroy(link); 86 115 87 116 if (!ASSERT_OK(err, "test_run")) 88 117 goto cleanup;
+14
tools/testing/selftests/bpf/prog_tests/tp_btf_nullable.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <test_progs.h> 4 + #include "test_tp_btf_nullable.skel.h" 5 + 6 + void test_tp_btf_nullable(void) 7 + { 8 + if (!env.has_testmod) { 9 + test__skip(); 10 + return; 11 + } 12 + 13 + RUN_TESTS(test_tp_btf_nullable); 14 + }
+25
tools/testing/selftests/bpf/progs/dynptr_fail.c
··· 6 6 #include <stdbool.h> 7 7 #include <linux/bpf.h> 8 8 #include <bpf/bpf_helpers.h> 9 + #include <bpf/bpf_tracing.h> 9 10 #include <linux/if_ether.h> 10 11 #include "bpf_misc.h" 11 12 #include "bpf_kfuncs.h" ··· 1251 1250 1252 1251 /* this should fail */ 1253 1252 bpf_dynptr_from_skb(ctx, 0, &ptr); 1253 + 1254 + return 0; 1255 + } 1256 + 1257 + SEC("fentry/skb_tx_error") 1258 + __failure __msg("must be referenced or trusted") 1259 + int BPF_PROG(skb_invalid_ctx_fentry, void *skb) 1260 + { 1261 + struct bpf_dynptr ptr; 1262 + 1263 + /* this should fail */ 1264 + bpf_dynptr_from_skb(skb, 0, &ptr); 1265 + 1266 + return 0; 1267 + } 1268 + 1269 + SEC("fexit/skb_tx_error") 1270 + __failure __msg("must be referenced or trusted") 1271 + int BPF_PROG(skb_invalid_ctx_fexit, void *skb) 1272 + { 1273 + struct bpf_dynptr ptr; 1274 + 1275 + /* this should fail */ 1276 + bpf_dynptr_from_skb(skb, 0, &ptr); 1254 1277 1255 1278 return 0; 1256 1279 }
+23
tools/testing/selftests/bpf/progs/dynptr_success.c
··· 5 5 #include <stdbool.h> 6 6 #include <linux/bpf.h> 7 7 #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 8 9 #include "bpf_misc.h" 9 10 #include "bpf_kfuncs.h" 10 11 #include "errno.h" ··· 540 539 data = bpf_dynptr_slice(&ptr, 0, NULL, 10); 541 540 if (data) { 542 541 bpf_strncmp(data, 10, "foo"); 542 + return 1; 543 + } 544 + 545 + return 1; 546 + } 547 + 548 + SEC("tp_btf/kfree_skb") 549 + int BPF_PROG(test_dynptr_skb_tp_btf, void *skb, void *location) 550 + { 551 + __u8 write_data[2] = {1, 2}; 552 + struct bpf_dynptr ptr; 553 + int ret; 554 + 555 + if (bpf_dynptr_from_skb(skb, 0, &ptr)) { 556 + err = 1; 557 + return 1; 558 + } 559 + 560 + /* since tp_btf skbs are read only, writes should fail */ 561 + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); 562 + if (ret != -EINVAL) { 563 + err = 2; 543 564 return 1; 544 565 } 545 566
+24
tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + #include "../bpf_testmod/bpf_testmod.h" 7 + #include "bpf_misc.h" 8 + 9 + SEC("tp_btf/bpf_testmod_test_nullable_bare") 10 + __failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") 11 + int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) 12 + { 13 + return nullable_ctx->len; 14 + } 15 + 16 + SEC("tp_btf/bpf_testmod_test_nullable_bare") 17 + int BPF_PROG(handle_tp_btf_nullable_bare2, struct bpf_testmod_test_read_ctx *nullable_ctx) 18 + { 19 + if (nullable_ctx) 20 + return nullable_ctx->len; 21 + return 0; 22 + } 23 + 24 + char _license[] SEC("license") = "GPL";
+38 -5
tools/testing/selftests/bpf/xskxceiver.c
··· 324 324 return zc_avail; 325 325 } 326 326 327 + #define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" 328 + static unsigned int get_max_skb_frags(void) 329 + { 330 + unsigned int max_skb_frags = 0; 331 + FILE *file; 332 + 333 + file = fopen(MAX_SKB_FRAGS_PATH, "r"); 334 + if (!file) { 335 + ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); 336 + return 0; 337 + } 338 + 339 + if (fscanf(file, "%u", &max_skb_frags) != 1) 340 + ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); 341 + 342 + fclose(file); 343 + return max_skb_frags; 344 + } 345 + 327 346 static struct option long_options[] = { 328 347 {"interface", required_argument, 0, 'i'}, 329 348 {"busy-poll", no_argument, 0, 'b'}, ··· 2263 2244 2264 2245 static int testapp_too_many_frags(struct test_spec *test) 2265 2246 { 2266 - struct pkt pkts[2 * XSK_DESC__MAX_SKB_FRAGS + 2] = {}; 2247 + struct pkt *pkts; 2267 2248 u32 max_frags, i; 2249 + int ret; 2268 2250 2269 - if (test->mode == TEST_MODE_ZC) 2251 + if (test->mode == TEST_MODE_ZC) { 2270 2252 max_frags = test->ifobj_tx->xdp_zc_max_segs; 2271 - else 2272 - max_frags = XSK_DESC__MAX_SKB_FRAGS; 2253 + } else { 2254 + max_frags = get_max_skb_frags(); 2255 + if (!max_frags) { 2256 + ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n"); 2257 + max_frags = 17; 2258 + } 2259 + max_frags += 1; 2260 + } 2261 + 2262 + pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); 2263 + if (!pkts) 2264 + return TEST_FAILURE; 2273 2265 2274 2266 test->mtu = MAX_ETH_JUMBO_SIZE; 2275 2267 ··· 2310 2280 pkts[2 * max_frags + 1].valid = true; 2311 2281 2312 2282 pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); 2313 - return testapp_validate_traffic(test); 2283 + ret = testapp_validate_traffic(test); 2284 + 2285 + free(pkts); 2286 + return ret; 2314 2287 } 2315 2288 2316 2289 static int xsk_load_xdp_programs(struct ifobject *ifobj)
-1
tools/testing/selftests/bpf/xskxceiver.h
··· 55 55 #define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) 56 56 #define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) 57 57 #define XSK_DESC__INVALID_OPTION (0xffff) 58 - #define XSK_DESC__MAX_SKB_FRAGS 18 59 58 #define HUGEPAGE_SIZE (2 * 1024 * 1024) 60 59 #define PKT_DUMP_NB_TO_PRINT 16 61 60 #define RUN_ALL_TESTS UINT_MAX