Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-10-02

We've added 11 non-merge commits during the last 12 day(s) which contain
a total of 12 files changed, 176 insertions(+), 41 deletions(-).

The main changes are:

1) Fix BPF verifier to reset backtrack_state masks on global function
exit as otherwise subsequent precision tracking would reuse them,
from Andrii Nakryiko.

2) Several sockmap fixes for available bytes accounting,
from John Fastabend.

3) Reject sk_msg egress redirects to non-TCP sockets given this
is only supported for TCP sockets today, from Jakub Sitnicki.

4) Fix a syzkaller splat in bpf_mprog when hitting maximum program
limits with BPF_F_BEFORE directive, from Daniel Borkmann
and Nikolay Aleksandrov.

5) Fix BPF memory allocator to use kmalloc_size_roundup() to adjust
size_index for selecting a bpf_mem_cache, from Hou Tao.

6) Fix arch_prepare_bpf_trampoline return code for s390 JIT,
from Song Liu.

7) Fix bpf_trampoline_get when CONFIG_BPF_JIT is turned off,
from Leon Hwang.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
bpf: Use kmalloc_size_roundup() to adjust size_index
selftest/bpf: Add various selftests for program limits
bpf, mprog: Fix maximum program check on mprog attachment
bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets
bpf, sockmap: Add tests for MSG_F_PEEK
bpf, sockmap: Do not inc copied_seq when PEEK flag set
bpf: tcp_read_skb needs to pop skb regardless of seq
bpf: unconditionally reset backtrack_state masks on global func exit
bpf: Fix tr dereferencing
selftests/bpf: Check bpf_cubic_acked() is called via struct_ops
s390/bpf: Let arch_prepare_bpf_trampoline return program size
====================

Link: https://lore.kernel.org/r/20231002113417.2309-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+176 -41
+1 -1
arch/s390/net/bpf_jit_comp.c
··· 2513 2513 return -E2BIG; 2514 2514 } 2515 2515 2516 - return ret; 2516 + return tjit.common.prg; 2517 2517 } 2518 2518 2519 2519 bool bpf_jit_supports_subprog_tailcalls(void)
+1 -1
include/linux/bpf.h
··· 1307 1307 static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, 1308 1308 struct bpf_attach_target_info *tgt_info) 1309 1309 { 1310 - return ERR_PTR(-EOPNOTSUPP); 1310 + return NULL; 1311 1311 } 1312 1312 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} 1313 1313 #define DEFINE_BPF_DISPATCHER(name)
+19 -25
kernel/bpf/memalloc.c
··· 965 965 return !ret ? NULL : ret + LLIST_NODE_SZ; 966 966 } 967 967 968 - /* Most of the logic is taken from setup_kmalloc_cache_index_table() */ 969 968 static __init int bpf_mem_cache_adjust_size(void) 970 969 { 971 - unsigned int size, index; 970 + unsigned int size; 972 971 973 - /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be 974 - * up-to 256-bytes. 972 + /* Adjusting the indexes in size_index() according to the object_size 973 + * of underlying slab cache, so bpf_mem_alloc() will select a 974 + * bpf_mem_cache with unit_size equal to the object_size of 975 + * the underlying slab cache. 976 + * 977 + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is 978 + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. 975 979 */ 976 - size = KMALLOC_MIN_SIZE; 977 - if (size <= 192) 978 - index = size_index[(size - 1) / 8]; 979 - else 980 - index = fls(size - 1) - 1; 981 - for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8) 982 - size_index[(size - 1) / 8] = index; 980 + for (size = 192; size >= 8; size -= 8) { 981 + unsigned int kmalloc_size, index; 983 982 984 - /* The minimal alignment is 64-bytes, so disable 96-bytes cache and 985 - * use 128-bytes cache instead. 986 - */ 987 - if (KMALLOC_MIN_SIZE >= 64) { 988 - index = size_index[(128 - 1) / 8]; 989 - for (size = 64 + 8; size <= 96; size += 8) 990 - size_index[(size - 1) / 8] = index; 991 - } 983 + kmalloc_size = kmalloc_size_roundup(size); 984 + if (kmalloc_size == size) 985 + continue; 992 986 993 - /* The minimal alignment is 128-bytes, so disable 192-bytes cache and 994 - * use 256-bytes cache instead. 
995 - */ 996 - if (KMALLOC_MIN_SIZE >= 128) { 997 - index = fls(256 - 1) - 1; 998 - for (size = 128 + 8; size <= 192; size += 8) 987 + if (kmalloc_size <= 192) 988 + index = size_index[(kmalloc_size - 1) / 8]; 989 + else 990 + index = fls(kmalloc_size - 1) - 1; 991 + /* Only overwrite if necessary */ 992 + if (size_index[(size - 1) / 8] != index) 999 993 size_index[(size - 1) / 8] = index; 1000 994 } 1001 995
+3
kernel/bpf/mprog.c
··· 253 253 goto out; 254 254 } 255 255 idx = tidx; 256 + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { 257 + ret = -ERANGE; 258 + goto out; 256 259 } 257 260 if (flags & BPF_F_BEFORE) { 258 261 tidx = bpf_mprog_pos_before(entry, &rtuple);
+3 -5
kernel/bpf/verifier.c
··· 4047 4047 bitmap_from_u64(mask, bt_reg_mask(bt)); 4048 4048 for_each_set_bit(i, mask, 32) { 4049 4049 reg = &st->frame[0]->regs[i]; 4050 - if (reg->type != SCALAR_VALUE) { 4051 - bt_clear_reg(bt, i); 4052 - continue; 4053 - } 4054 - reg->precise = true; 4050 + bt_clear_reg(bt, i); 4051 + if (reg->type == SCALAR_VALUE) 4052 + reg->precise = true; 4055 4053 } 4056 4054 return 0; 4057 4055 }
+4
net/core/sock_map.c
··· 668 668 sk = __sock_map_lookup_elem(map, key); 669 669 if (unlikely(!sk || !sock_map_redirect_allowed(sk))) 670 670 return SK_DROP; 671 + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) 672 + return SK_DROP; 671 673 672 674 msg->flags = flags; 673 675 msg->sk_redir = sk; ··· 1268 1266 1269 1267 sk = __sock_hash_lookup_elem(map, key); 1270 1268 if (unlikely(!sk || !sock_map_redirect_allowed(sk))) 1269 + return SK_DROP; 1270 + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) 1271 1271 return SK_DROP; 1272 1272 1273 1273 msg->flags = flags;
+2 -8
net/ipv4/tcp.c
··· 1621 1621 1622 1622 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 1623 1623 { 1624 - struct tcp_sock *tp = tcp_sk(sk); 1625 - u32 seq = tp->copied_seq; 1626 1624 struct sk_buff *skb; 1627 1625 int copied = 0; 1628 - u32 offset; 1629 1626 1630 1627 if (sk->sk_state == TCP_LISTEN) 1631 1628 return -ENOTCONN; 1632 1629 1633 - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { 1630 + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1634 1631 u8 tcp_flags; 1635 1632 int used; 1636 1633 ··· 1640 1643 copied = used; 1641 1644 break; 1642 1645 } 1643 - seq += used; 1644 1646 copied += used; 1645 1647 1646 - if (tcp_flags & TCPHDR_FIN) { 1647 - ++seq; 1648 + if (tcp_flags & TCPHDR_FIN) 1648 1649 break; 1649 - } 1650 1650 } 1651 1651 return copied; 1652 1652 }
+3 -1
net/ipv4/tcp_bpf.c
··· 222 222 int *addr_len) 223 223 { 224 224 struct tcp_sock *tcp = tcp_sk(sk); 225 + int peek = flags & MSG_PEEK; 225 226 u32 seq = tcp->copied_seq; 226 227 struct sk_psock *psock; 227 228 int copied = 0; ··· 312 311 copied = -EAGAIN; 313 312 } 314 313 out: 315 - WRITE_ONCE(tcp->copied_seq, seq); 314 + if (!peek) 315 + WRITE_ONCE(tcp->copied_seq, seq); 316 316 tcp_rcv_space_adjust(sk); 317 317 if (copied > 0) 318 318 __tcp_cleanup_rbuf(sk, copied);
+2
tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
··· 185 185 186 186 do_test("bpf_cubic", NULL); 187 187 188 + ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); 189 + 188 190 bpf_link__destroy(link); 189 191 bpf_cubic__destroy(cubic_skel); 190 192 }
+51
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 475 475 test_sockmap_drop_prog__destroy(drop); 476 476 } 477 477 478 + static void test_sockmap_skb_verdict_peek(void) 479 + { 480 + int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; 481 + struct test_sockmap_pass_prog *pass; 482 + char snd[256] = "0123456789"; 483 + char rcv[256] = "0"; 484 + 485 + pass = test_sockmap_pass_prog__open_and_load(); 486 + if (!ASSERT_OK_PTR(pass, "open_and_load")) 487 + return; 488 + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); 489 + map = bpf_map__fd(pass->maps.sock_map_rx); 490 + 491 + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); 492 + if (!ASSERT_OK(err, "bpf_prog_attach")) 493 + goto out; 494 + 495 + s = socket_loopback(AF_INET, SOCK_STREAM); 496 + if (!ASSERT_GT(s, -1, "socket_loopback(s)")) 497 + goto out; 498 + 499 + err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); 500 + if (!ASSERT_OK(err, "create_pairs(s)")) 501 + goto out; 502 + 503 + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); 504 + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) 505 + goto out_close; 506 + 507 + sent = xsend(p1, snd, sizeof(snd), 0); 508 + ASSERT_EQ(sent, sizeof(snd), "xsend(p1)"); 509 + recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK); 510 + ASSERT_EQ(recvd, sizeof(rcv), "recv(c1)"); 511 + err = ioctl(c1, FIONREAD, &avail); 512 + ASSERT_OK(err, "ioctl(FIONREAD) error"); 513 + ASSERT_EQ(avail, sizeof(snd), "after peek ioctl(FIONREAD)"); 514 + recvd = recv(c1, rcv, sizeof(rcv), 0); 515 + ASSERT_EQ(recvd, sizeof(rcv), "recv(p0)"); 516 + err = ioctl(c1, FIONREAD, &avail); 517 + ASSERT_OK(err, "ioctl(FIONREAD) error"); 518 + ASSERT_EQ(avail, 0, "after read ioctl(FIONREAD)"); 519 + 520 + out_close: 521 + close(c1); 522 + close(p1); 523 + out: 524 + test_sockmap_pass_prog__destroy(pass); 525 + } 526 + 478 527 void test_sockmap_basic(void) 479 528 { 480 529 if (test__start_subtest("sockmap create_update_free")) ··· 564 515 test_sockmap_skb_verdict_fionread(true);
565 516 if (test__start_subtest("sockmap skb_verdict fionread on drop")) 566 517 test_sockmap_skb_verdict_fionread(false); 518 + if (test__start_subtest("sockmap skb_verdict msg_f_peek")) 519 + test_sockmap_skb_verdict_peek(); 567 520 }
+84
tools/testing/selftests/bpf/prog_tests/tc_opts.c
··· 2378 2378 test_tc_chain_mixed(BPF_TCX_INGRESS); 2379 2379 test_tc_chain_mixed(BPF_TCX_EGRESS); 2380 2380 } 2381 + 2382 + static int generate_dummy_prog(void) 2383 + { 2384 + const struct bpf_insn prog_insns[] = { 2385 + BPF_MOV64_IMM(BPF_REG_0, 0), 2386 + BPF_EXIT_INSN(), 2387 + }; 2388 + const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); 2389 + LIBBPF_OPTS(bpf_prog_load_opts, opts); 2390 + const size_t log_buf_sz = 256; 2391 + char *log_buf; 2392 + int fd = -1; 2393 + 2394 + log_buf = malloc(log_buf_sz); 2395 + if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) 2396 + return fd; 2397 + opts.log_buf = log_buf; 2398 + opts.log_size = log_buf_sz; 2399 + 2400 + log_buf[0] = '\0'; 2401 + opts.log_level = 0; 2402 + fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, "tcx_prog", "GPL", 2403 + prog_insns, prog_insn_cnt, &opts); 2404 + ASSERT_STREQ(log_buf, "", "log_0"); 2405 + ASSERT_GE(fd, 0, "prog_fd"); 2406 + free(log_buf); 2407 + return fd; 2408 + } 2409 + 2410 + static void test_tc_opts_max_target(int target, int flags, bool relative) 2411 + { 2412 + int err, ifindex, i, prog_fd, last_fd = -1; 2413 + LIBBPF_OPTS(bpf_prog_attach_opts, opta); 2414 + const int max_progs = 63; 2415 + 2416 + ASSERT_OK(system("ip link add dev tcx_opts1 type veth peer name tcx_opts2"), "add veth"); 2417 + ifindex = if_nametoindex("tcx_opts1"); 2418 + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); 2419 + 2420 + assert_mprog_count_ifindex(ifindex, target, 0); 2421 + 2422 + for (i = 0; i < max_progs; i++) { 2423 + prog_fd = generate_dummy_prog(); 2424 + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) 2425 + goto cleanup; 2426 + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); 2427 + if (!ASSERT_EQ(err, 0, "prog_attach")) 2428 + goto cleanup; 2429 + assert_mprog_count_ifindex(ifindex, target, i + 1); 2430 + if (i == max_progs - 1 && relative) 2431 + last_fd = prog_fd; 2432 + else 2433 + close(prog_fd); 2434 + } 2435 + 2436 + prog_fd = generate_dummy_prog();
2437 + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) 2438 + goto cleanup; 2439 + opta.flags = flags; 2440 + if (last_fd > 0) 2441 + opta.relative_fd = last_fd; 2442 + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); 2443 + ASSERT_EQ(err, -ERANGE, "prog_64_attach"); 2444 + assert_mprog_count_ifindex(ifindex, target, max_progs); 2445 + close(prog_fd); 2446 + cleanup: 2447 + if (last_fd > 0) 2448 + close(last_fd); 2449 + ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth"); 2450 + ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed"); 2451 + ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed"); 2452 + } 2453 + 2454 + void serial_test_tc_opts_max(void) 2455 + { 2456 + test_tc_opts_max_target(BPF_TCX_INGRESS, 0, false); 2457 + test_tc_opts_max_target(BPF_TCX_EGRESS, 0, false); 2458 + 2459 + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_BEFORE, false); 2460 + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_BEFORE, true); 2461 + 2462 + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_AFTER, true); 2463 + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_AFTER, false); 2464 + }
+3
tools/testing/selftests/bpf/progs/bpf_cubic.c
··· 490 490 } 491 491 } 492 492 493 + int bpf_cubic_acked_called = 0; 494 + 493 495 void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, 494 496 const struct ack_sample *sample) 495 497 { ··· 499 497 struct bictcp *ca = inet_csk_ca(sk); 500 498 __u32 delay; 501 499 500 + bpf_cubic_acked_called = 1; 502 501 /* Some calls are for duplicates without timetamps */ 503 502 if (sample->rtt_us < 0) 504 503 return;