Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-10-02

We've added 11 non-merge commits during the last 12 day(s) which contain
a total of 12 files changed, 176 insertions(+), 41 deletions(-).

The main changes are:

1) Fix BPF verifier to reset backtrack_state masks on global function
exit as otherwise subsequent precision tracking would reuse them,
from Andrii Nakryiko.

2) Several sockmap fixes for available bytes accounting,
from John Fastabend.

3) Reject sk_msg egress redirects to non-TCP sockets given this
is only supported for TCP sockets today, from Jakub Sitnicki.

4) Fix a syzkaller splat in bpf_mprog when hitting maximum program
limits with BPF_F_BEFORE directive, from Daniel Borkmann
and Nikolay Aleksandrov.

5) Fix BPF memory allocator to use kmalloc_size_roundup() to adjust
size_index for selecting a bpf_mem_cache, from Hou Tao.

6) Fix arch_prepare_bpf_trampoline return code for s390 JIT,
from Song Liu.

7) Fix bpf_trampoline_get when CONFIG_BPF_JIT is turned off,
from Leon Hwang.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
bpf: Use kmalloc_size_roundup() to adjust size_index
selftest/bpf: Add various selftests for program limits
bpf, mprog: Fix maximum program check on mprog attachment
bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets
bpf, sockmap: Add tests for MSG_F_PEEK
bpf, sockmap: Do not inc copied_seq when PEEK flag set
bpf: tcp_read_skb needs to pop skb regardless of seq
bpf: unconditionally reset backtrack_state masks on global func exit
bpf: Fix tr dereferencing
selftests/bpf: Check bpf_cubic_acked() is called via struct_ops
s390/bpf: Let arch_prepare_bpf_trampoline return program size
====================

Link: https://lore.kernel.org/r/20231002113417.2309-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+176 -41
+1 -1
arch/s390/net/bpf_jit_comp.c
··· 2513 2513 return -E2BIG; 2514 2514 } 2515 2515 2516 - return ret; 2516 + return tjit.common.prg; 2517 2517 } 2518 2518 2519 2519 bool bpf_jit_supports_subprog_tailcalls(void)
+1 -1
include/linux/bpf.h
··· 1307 1307 static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, 1308 1308 struct bpf_attach_target_info *tgt_info) 1309 1309 { 1310 - return ERR_PTR(-EOPNOTSUPP); 1310 + return NULL; 1311 1311 } 1312 1312 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} 1313 1313 #define DEFINE_BPF_DISPATCHER(name)
+19 -25
kernel/bpf/memalloc.c
··· 965 965 return !ret ? NULL : ret + LLIST_NODE_SZ; 966 966 } 967 967 968 - /* Most of the logic is taken from setup_kmalloc_cache_index_table() */ 969 968 static __init int bpf_mem_cache_adjust_size(void) 970 969 { 971 - unsigned int size, index; 970 + unsigned int size; 972 971 973 - /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be 974 - * up-to 256-bytes. 972 + /* Adjusting the indexes in size_index() according to the object_size 973 + * of underlying slab cache, so bpf_mem_alloc() will select a 974 + * bpf_mem_cache with unit_size equal to the object_size of 975 + * the underlying slab cache. 976 + * 977 + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is 978 + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. 975 979 */ 976 - size = KMALLOC_MIN_SIZE; 977 - if (size <= 192) 978 - index = size_index[(size - 1) / 8]; 979 - else 980 - index = fls(size - 1) - 1; 981 - for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8) 982 - size_index[(size - 1) / 8] = index; 980 + for (size = 192; size >= 8; size -= 8) { 981 + unsigned int kmalloc_size, index; 983 982 984 - /* The minimal alignment is 64-bytes, so disable 96-bytes cache and 985 - * use 128-bytes cache instead. 986 - */ 987 - if (KMALLOC_MIN_SIZE >= 64) { 988 - index = size_index[(128 - 1) / 8]; 989 - for (size = 64 + 8; size <= 96; size += 8) 990 - size_index[(size - 1) / 8] = index; 991 - } 983 + kmalloc_size = kmalloc_size_roundup(size); 984 + if (kmalloc_size == size) 985 + continue; 992 986 993 - /* The minimal alignment is 128-bytes, so disable 192-bytes cache and 994 - * use 256-bytes cache instead. 
995 - */ 996 - if (KMALLOC_MIN_SIZE >= 128) { 997 - index = fls(256 - 1) - 1; 998 - for (size = 128 + 8; size <= 192; size += 8) 987 + if (kmalloc_size <= 192) 988 + index = size_index[(kmalloc_size - 1) / 8]; 989 + else 990 + index = fls(kmalloc_size - 1) - 1; 991 + /* Only overwrite if necessary */ 992 + if (size_index[(size - 1) / 8] != index) 999 993 size_index[(size - 1) / 8] = index; 1000 994 } 1001 995
+3
kernel/bpf/mprog.c
··· 253 253 goto out; 254 254 } 255 255 idx = tidx; 256 + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { 257 + ret = -ERANGE; 258 + goto out; 256 259 } 257 260 if (flags & BPF_F_BEFORE) { 258 261 tidx = bpf_mprog_pos_before(entry, &rtuple);
+3 -5
kernel/bpf/verifier.c
··· 4047 4047 bitmap_from_u64(mask, bt_reg_mask(bt)); 4048 4048 for_each_set_bit(i, mask, 32) { 4049 4049 reg = &st->frame[0]->regs[i]; 4050 - if (reg->type != SCALAR_VALUE) { 4051 - bt_clear_reg(bt, i); 4052 - continue; 4053 - } 4054 - reg->precise = true; 4050 + bt_clear_reg(bt, i); 4051 + if (reg->type == SCALAR_VALUE) 4052 + reg->precise = true; 4055 4053 } 4056 4054 return 0; 4057 4055 }
+4
net/core/sock_map.c
··· 668 668 sk = __sock_map_lookup_elem(map, key); 669 669 if (unlikely(!sk || !sock_map_redirect_allowed(sk))) 670 670 return SK_DROP; 671 + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) 672 + return SK_DROP; 671 673 672 674 msg->flags = flags; 673 675 msg->sk_redir = sk; ··· 1268 1266 1269 1267 sk = __sock_hash_lookup_elem(map, key); 1270 1268 if (unlikely(!sk || !sock_map_redirect_allowed(sk))) 1269 + return SK_DROP; 1270 + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) 1271 1271 return SK_DROP; 1272 1272 1273 1273 msg->flags = flags;
+2 -8
net/ipv4/tcp.c
··· 1621 1621 1622 1622 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 1623 1623 { 1624 - struct tcp_sock *tp = tcp_sk(sk); 1625 - u32 seq = tp->copied_seq; 1626 1624 struct sk_buff *skb; 1627 1625 int copied = 0; 1628 - u32 offset; 1629 1626 1630 1627 if (sk->sk_state == TCP_LISTEN) 1631 1628 return -ENOTCONN; 1632 1629 1633 - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { 1630 + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1634 1631 u8 tcp_flags; 1635 1632 int used; 1636 1633 ··· 1640 1643 copied = used; 1641 1644 break; 1642 1645 } 1643 - seq += used; 1644 1646 copied += used; 1645 1647 1646 - if (tcp_flags & TCPHDR_FIN) { 1647 - ++seq; 1648 + if (tcp_flags & TCPHDR_FIN) 1648 1649 break; 1649 - } 1650 1650 } 1651 1651 return copied; 1652 1652 }
+3 -1
net/ipv4/tcp_bpf.c
··· 222 222 int *addr_len) 223 223 { 224 224 struct tcp_sock *tcp = tcp_sk(sk); 225 + int peek = flags & MSG_PEEK; 225 226 u32 seq = tcp->copied_seq; 226 227 struct sk_psock *psock; 227 228 int copied = 0; ··· 312 311 copied = -EAGAIN; 313 312 } 314 313 out: 315 - WRITE_ONCE(tcp->copied_seq, seq); 314 + if (!peek) 315 + WRITE_ONCE(tcp->copied_seq, seq); 316 316 tcp_rcv_space_adjust(sk); 317 317 if (copied > 0) 318 318 __tcp_cleanup_rbuf(sk, copied);
+2
tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
··· 185 185 186 186 do_test("bpf_cubic", NULL); 187 187 188 + ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); 189 + 188 190 bpf_link__destroy(link); 189 191 bpf_cubic__destroy(cubic_skel); 190 192 }
+51
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 475 475 test_sockmap_drop_prog__destroy(drop); 476 476 } 477 477 478 + static void test_sockmap_skb_verdict_peek(void) 479 + { 480 + int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; 481 + struct test_sockmap_pass_prog *pass; 482 + char snd[256] = "0123456789"; 483 + char rcv[256] = "0"; 484 + 485 + pass = test_sockmap_pass_prog__open_and_load(); 486 + if (!ASSERT_OK_PTR(pass, "open_and_load")) 487 + return; 488 + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); 489 + map = bpf_map__fd(pass->maps.sock_map_rx); 490 + 491 + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); 492 + if (!ASSERT_OK(err, "bpf_prog_attach")) 493 + goto out; 494 + 495 + s = socket_loopback(AF_INET, SOCK_STREAM); 496 + if (!ASSERT_GT(s, -1, "socket_loopback(s)")) 497 + goto out; 498 + 499 + err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); 500 + if (!ASSERT_OK(err, "create_pairs(s)")) 501 + goto out; 502 + 503 + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); 504 + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) 505 + goto out_close; 506 + 507 + sent = xsend(p1, snd, sizeof(snd), 0); 508 + ASSERT_EQ(sent, sizeof(snd), "xsend(p1)"); 509 + recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK); 510 + ASSERT_EQ(recvd, sizeof(rcv), "recv(c1)"); 511 + err = ioctl(c1, FIONREAD, &avail); 512 + ASSERT_OK(err, "ioctl(FIONREAD) error"); 513 + ASSERT_EQ(avail, sizeof(snd), "after peek ioctl(FIONREAD)"); 514 + recvd = recv(c1, rcv, sizeof(rcv), 0); 515 + ASSERT_EQ(recvd, sizeof(rcv), "recv(p0)"); 516 + err = ioctl(c1, FIONREAD, &avail); 517 + ASSERT_OK(err, "ioctl(FIONREAD) error"); 518 + ASSERT_EQ(avail, 0, "after read ioctl(FIONREAD)"); 519 + 520 + out_close: 521 + close(c1); 522 + close(p1); 523 + out: 524 + test_sockmap_pass_prog__destroy(pass); 525 + } 526 + 478 527 void test_sockmap_basic(void) 479 528 { 480 529 if (test__start_subtest("sockmap create_update_free")) ··· 564 515 test_sockmap_skb_verdict_fionread(true);
565 516 if (test__start_subtest("sockmap skb_verdict fionread on drop")) 566 517 test_sockmap_skb_verdict_fionread(false); 518 + if (test__start_subtest("sockmap skb_verdict msg_f_peek")) 519 + test_sockmap_skb_verdict_peek(); 567 520 }
+84
tools/testing/selftests/bpf/prog_tests/tc_opts.c
··· 2378 2378 test_tc_chain_mixed(BPF_TCX_INGRESS); 2379 2379 test_tc_chain_mixed(BPF_TCX_EGRESS); 2380 2380 } 2381 + 2382 + static int generate_dummy_prog(void) 2383 + { 2384 + const struct bpf_insn prog_insns[] = { 2385 + BPF_MOV64_IMM(BPF_REG_0, 0), 2386 + BPF_EXIT_INSN(), 2387 + }; 2388 + const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); 2389 + LIBBPF_OPTS(bpf_prog_load_opts, opts); 2390 + const size_t log_buf_sz = 256; 2391 + char *log_buf; 2392 + int fd = -1; 2393 + 2394 + log_buf = malloc(log_buf_sz); 2395 + if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) 2396 + return fd; 2397 + opts.log_buf = log_buf; 2398 + opts.log_size = log_buf_sz; 2399 + 2400 + log_buf[0] = '\0'; 2401 + opts.log_level = 0; 2402 + fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, "tcx_prog", "GPL", 2403 + prog_insns, prog_insn_cnt, &opts); 2404 + ASSERT_STREQ(log_buf, "", "log_0"); 2405 + ASSERT_GE(fd, 0, "prog_fd"); 2406 + free(log_buf); 2407 + return fd; 2408 + } 2409 + 2410 + static void test_tc_opts_max_target(int target, int flags, bool relative) 2411 + { 2412 + int err, ifindex, i, prog_fd, last_fd = -1; 2413 + LIBBPF_OPTS(bpf_prog_attach_opts, opta); 2414 + const int max_progs = 63; 2415 + 2416 + ASSERT_OK(system("ip link add dev tcx_opts1 type veth peer name tcx_opts2"), "add veth"); 2417 + ifindex = if_nametoindex("tcx_opts1"); 2418 + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); 2419 + 2420 + assert_mprog_count_ifindex(ifindex, target, 0); 2421 + 2422 + for (i = 0; i < max_progs; i++) { 2423 + prog_fd = generate_dummy_prog(); 2424 + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) 2425 + goto cleanup; 2426 + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); 2427 + if (!ASSERT_EQ(err, 0, "prog_attach")) 2428 + goto cleanup; 2429 + assert_mprog_count_ifindex(ifindex, target, i + 1); 2430 + if (i == max_progs - 1 && relative) 2431 + last_fd = prog_fd; 2432 + else 2433 + close(prog_fd); 2434 + } 2435 + 2436 + prog_fd = generate_dummy_prog();
2437 + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) 2438 + goto cleanup; 2439 + opta.flags = flags; 2440 + if (last_fd > 0) 2441 + opta.relative_fd = last_fd; 2442 + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); 2443 + ASSERT_EQ(err, -ERANGE, "prog_64_attach"); 2444 + assert_mprog_count_ifindex(ifindex, target, max_progs); 2445 + close(prog_fd); 2446 + cleanup: 2447 + if (last_fd > 0) 2448 + close(last_fd); 2449 + ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth"); 2450 + ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed"); 2451 + ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed"); 2452 + } 2453 + 2454 + void serial_test_tc_opts_max(void) 2455 + { 2456 + test_tc_opts_max_target(BPF_TCX_INGRESS, 0, false); 2457 + test_tc_opts_max_target(BPF_TCX_EGRESS, 0, false); 2458 + 2459 + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_BEFORE, false); 2460 + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_BEFORE, true); 2461 + 2462 + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_AFTER, true); 2463 + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_AFTER, false); 2464 + }
+3
tools/testing/selftests/bpf/progs/bpf_cubic.c
··· 490 490 } 491 491 } 492 492 493 + int bpf_cubic_acked_called = 0; 494 + 493 495 void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, 494 496 const struct ack_sample *sample) 495 497 { ··· 499 497 struct bictcp *ca = inet_csk_ca(sk); 500 498 __u32 delay; 501 499 500 + bpf_cubic_acked_called = 1; 502 501 /* Some calls are for duplicates without timetamps */ 503 502 if (sample->rtt_us < 0) 504 503 return;