Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

packet: add classic BPF fanout mode

Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program
to select a socket.

This avoids having to keep adding special-case fanout modes. One
example use case is application-layer load balancing. The QUIC
protocol, for instance, encodes a connection ID in the UDP payload.

Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the
only user so far.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Willem de Bruijn and committed by
David S. Miller
47dceb8e a1c234f9

+104 -2
+2
include/uapi/linux/if_packet.h
··· 55 55 #define PACKET_TX_HAS_OFF 19 56 56 #define PACKET_QDISC_BYPASS 20 57 57 #define PACKET_ROLLOVER_STATS 21 58 + #define PACKET_FANOUT_DATA 22 58 59 59 60 #define PACKET_FANOUT_HASH 0 60 61 #define PACKET_FANOUT_LB 1 ··· 63 62 #define PACKET_FANOUT_ROLLOVER 3 64 63 #define PACKET_FANOUT_RND 4 65 64 #define PACKET_FANOUT_QM 5 65 + #define PACKET_FANOUT_CBPF 6 66 66 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 67 67 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 68 68
+98 -1
net/packet/af_packet.c
··· 92 92 #ifdef CONFIG_INET 93 93 #include <net/inet_common.h> 94 94 #endif 95 + #include <linux/bpf.h> 95 96 96 97 #include "internal.h" 97 98 ··· 1411 1410 return skb_get_queue_mapping(skb) % num; 1412 1411 } 1413 1412 1413 + static unsigned int fanout_demux_bpf(struct packet_fanout *f, 1414 + struct sk_buff *skb, 1415 + unsigned int num) 1416 + { 1417 + struct bpf_prog *prog; 1418 + unsigned int ret = 0; 1419 + 1420 + rcu_read_lock(); 1421 + prog = rcu_dereference(f->bpf_prog); 1422 + if (prog) 1423 + ret = BPF_PROG_RUN(prog, skb) % num; 1424 + rcu_read_unlock(); 1425 + 1426 + return ret; 1427 + } 1428 + 1414 1429 static bool fanout_has_flag(struct packet_fanout *f, u16 flag) 1415 1430 { 1416 1431 return f->flags & (flag >> 8); ··· 1470 1453 break; 1471 1454 case PACKET_FANOUT_ROLLOVER: 1472 1455 idx = fanout_demux_rollover(f, skb, 0, false, num); 1456 + break; 1457 + case PACKET_FANOUT_CBPF: 1458 + idx = fanout_demux_bpf(f, skb, num); 1473 1459 break; 1474 1460 } 1475 1461 ··· 1522 1502 return false; 1523 1503 } 1524 1504 1505 + static void fanout_init_data(struct packet_fanout *f) 1506 + { 1507 + switch (f->type) { 1508 + case PACKET_FANOUT_LB: 1509 + atomic_set(&f->rr_cur, 0); 1510 + break; 1511 + case PACKET_FANOUT_CBPF: 1512 + RCU_INIT_POINTER(f->bpf_prog, NULL); 1513 + break; 1514 + } 1515 + } 1516 + 1517 + static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) 1518 + { 1519 + struct bpf_prog *old; 1520 + 1521 + spin_lock(&f->lock); 1522 + old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); 1523 + rcu_assign_pointer(f->bpf_prog, new); 1524 + spin_unlock(&f->lock); 1525 + 1526 + if (old) { 1527 + synchronize_net(); 1528 + bpf_prog_destroy(old); 1529 + } 1530 + } 1531 + 1532 + static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, 1533 + unsigned int len) 1534 + { 1535 + struct bpf_prog *new; 1536 + struct sock_fprog fprog; 1537 + int ret; 1538 + 1539 + if (sock_flag(&po->sk, 
SOCK_FILTER_LOCKED)) 1540 + return -EPERM; 1541 + if (len != sizeof(fprog)) 1542 + return -EINVAL; 1543 + if (copy_from_user(&fprog, data, len)) 1544 + return -EFAULT; 1545 + 1546 + ret = bpf_prog_create_from_user(&new, &fprog, NULL); 1547 + if (ret) 1548 + return ret; 1549 + 1550 + __fanout_set_data_bpf(po->fanout, new); 1551 + return 0; 1552 + } 1553 + 1554 + static int fanout_set_data(struct packet_sock *po, char __user *data, 1555 + unsigned int len) 1556 + { 1557 + switch (po->fanout->type) { 1558 + case PACKET_FANOUT_CBPF: 1559 + return fanout_set_data_cbpf(po, data, len); 1560 + default: 1561 + return -EINVAL; 1562 + }; 1563 + } 1564 + 1565 + static void fanout_release_data(struct packet_fanout *f) 1566 + { 1567 + switch (f->type) { 1568 + case PACKET_FANOUT_CBPF: 1569 + __fanout_set_data_bpf(f, NULL); 1570 + }; 1571 + } 1572 + 1525 1573 static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1526 1574 { 1527 1575 struct packet_sock *po = pkt_sk(sk); ··· 1607 1519 case PACKET_FANOUT_CPU: 1608 1520 case PACKET_FANOUT_RND: 1609 1521 case PACKET_FANOUT_QM: 1522 + case PACKET_FANOUT_CBPF: 1610 1523 break; 1611 1524 default: 1612 1525 return -EINVAL; ··· 1650 1561 match->id = id; 1651 1562 match->type = type; 1652 1563 match->flags = flags; 1653 - atomic_set(&match->rr_cur, 0); 1654 1564 INIT_LIST_HEAD(&match->list); 1655 1565 spin_lock_init(&match->lock); 1656 1566 atomic_set(&match->sk_ref, 0); 1567 + fanout_init_data(match); 1657 1568 match->prot_hook.type = po->prot_hook.type; 1658 1569 match->prot_hook.dev = po->prot_hook.dev; 1659 1570 match->prot_hook.func = packet_rcv_fanout; ··· 1699 1610 if (atomic_dec_and_test(&f->sk_ref)) { 1700 1611 list_del(&f->list); 1701 1612 dev_remove_pack(&f->prot_hook); 1613 + fanout_release_data(f); 1702 1614 kfree(f); 1703 1615 } 1704 1616 mutex_unlock(&fanout_mutex); ··· 3618 3528 return -EFAULT; 3619 3529 3620 3530 return fanout_add(sk, val & 0xffff, val >> 16); 3531 + } 3532 + case PACKET_FANOUT_DATA: 3533 + { 3534 
+ if (!po->fanout) 3535 + return -EINVAL; 3536 + 3537 + return fanout_set_data(po, optval, optlen); 3621 3538 } 3622 3539 case PACKET_TX_HAS_OFF: 3623 3540 {
+4 -1
net/packet/internal.h
··· 79 79 u16 id; 80 80 u8 type; 81 81 u8 flags; 82 - atomic_t rr_cur; 82 + union { 83 + atomic_t rr_cur; 84 + struct bpf_prog __rcu *bpf_prog; 85 + }; 83 86 struct list_head list; 84 87 struct sock *arr[PACKET_FANOUT_MAX]; 85 88 spinlock_t lock;