Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/packet: support mergeable feature of virtio

Packet sockets, like tap, can be used as the backend for kernel vhost.
In packet sockets, virtio net header size is currently hardcoded to be
the size of struct virtio_net_hdr, which is 10 bytes; however, it is not
always the case: some virtio features, such as mrg_rxbuf, need virtio
net header to be 12-byte long.

Mergeable buffers, as a virtio feature, is worth supporting: if the
mrg_rxbuf feature is not used, packets larger than one mbuf will be
dropped in the vhost worker's handle_rx, but large packets cannot be
avoided and increasing the mbuf size is not economical.

With this virtio feature enabled by virtio-user, packet sockets with a
hardcoded 10-byte virtio net header will parse the mac header incorrectly
in packet_snd by taking the last two bytes of the virtio net header as
part of the mac header.
This incorrect mac header parsing will cause packets to be dropped, due
to failed Ethernet header validation later in the lower-layer device
receive path.

By adding an extra field vnet_hdr_sz (utilizing a hole in struct
packet_sock) to record the virtio net header size currently in use, and
by supporting an extra sockopt PACKET_VNET_HDR_SZ to set a specific
vnet_hdr_sz, packet sockets can know the exact length of the virtio net
header that the virtio user gives.
In packet_snd, tpacket_snd and packet_recvmsg, instead of using the
hardcoded virtio net header size, the exact vnet_hdr_sz is obtained from
the corresponding packet_sock, and the mac header is parsed correctly
based on this information, so that packets are not mistakenly dropped.

Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jianfeng Tan and committed by
David S. Miller
dfc39d40 156c9398

+60 -40
+1
include/uapi/linux/if_packet.h
··· 59 59 #define PACKET_ROLLOVER_STATS 21 60 60 #define PACKET_FANOUT_DATA 22 61 61 #define PACKET_IGNORE_OUTGOING 23 62 + #define PACKET_VNET_HDR_SZ 24 62 63 63 64 #define PACKET_FANOUT_HASH 0 64 65 #define PACKET_FANOUT_LB 1
+57 -38
net/packet/af_packet.c
··· 2090 2090 } 2091 2091 2092 2092 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, 2093 - size_t *len) 2093 + size_t *len, int vnet_hdr_sz) 2094 2094 { 2095 - struct virtio_net_hdr vnet_hdr; 2095 + struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; 2096 2096 2097 - if (*len < sizeof(vnet_hdr)) 2097 + if (*len < vnet_hdr_sz) 2098 2098 return -EINVAL; 2099 - *len -= sizeof(vnet_hdr); 2099 + *len -= vnet_hdr_sz; 2100 2100 2101 - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) 2101 + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) 2102 2102 return -EINVAL; 2103 2103 2104 - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); 2104 + return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); 2105 2105 } 2106 2106 2107 2107 /* ··· 2250 2250 __u32 ts_status; 2251 2251 bool is_drop_n_account = false; 2252 2252 unsigned int slot_id = 0; 2253 - bool do_vnet = false; 2253 + int vnet_hdr_sz = 0; 2254 2254 2255 2255 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. 2256 2256 * We may add members to them until current aligned size without forcing ··· 2308 2308 netoff = TPACKET_ALIGN(po->tp_hdrlen + 2309 2309 (maclen < 16 ? 
16 : maclen)) + 2310 2310 po->tp_reserve; 2311 - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { 2312 - netoff += sizeof(struct virtio_net_hdr); 2313 - do_vnet = true; 2314 - } 2311 + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); 2312 + if (vnet_hdr_sz) 2313 + netoff += vnet_hdr_sz; 2315 2314 macoff = netoff - maclen; 2316 2315 } 2317 2316 if (netoff > USHRT_MAX) { ··· 2336 2337 snaplen = po->rx_ring.frame_size - macoff; 2337 2338 if ((int)snaplen < 0) { 2338 2339 snaplen = 0; 2339 - do_vnet = false; 2340 + vnet_hdr_sz = 0; 2340 2341 } 2341 2342 } 2342 2343 } else if (unlikely(macoff + snaplen > ··· 2350 2351 if (unlikely((int)snaplen < 0)) { 2351 2352 snaplen = 0; 2352 2353 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; 2353 - do_vnet = false; 2354 + vnet_hdr_sz = 0; 2354 2355 } 2355 2356 } 2356 2357 spin_lock(&sk->sk_receive_queue.lock); ··· 2366 2367 __set_bit(slot_id, po->rx_ring.rx_owner_map); 2367 2368 } 2368 2369 2369 - if (do_vnet && 2370 + if (vnet_hdr_sz && 2370 2371 virtio_net_hdr_from_skb(skb, h.raw + macoff - 2371 2372 sizeof(struct virtio_net_hdr), 2372 2373 vio_le(), true, 0)) { ··· 2550 2551 } 2551 2552 2552 2553 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, 2553 - struct virtio_net_hdr *vnet_hdr) 2554 + struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) 2554 2555 { 2555 - if (*len < sizeof(*vnet_hdr)) 2556 + int ret; 2557 + 2558 + if (*len < vnet_hdr_sz) 2556 2559 return -EINVAL; 2557 - *len -= sizeof(*vnet_hdr); 2560 + *len -= vnet_hdr_sz; 2558 2561 2559 2562 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) 2560 2563 return -EFAULT; 2561 2564 2562 - return __packet_snd_vnet_parse(vnet_hdr, *len); 2565 + ret = __packet_snd_vnet_parse(vnet_hdr, *len); 2566 + if (ret) 2567 + return ret; 2568 + 2569 + /* move iter to point to the start of mac header */ 2570 + if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) 2571 + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); 2572 + 
2573 + return 0; 2563 2574 } 2564 2575 2565 2576 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, ··· 2731 2722 void *ph; 2732 2723 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); 2733 2724 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); 2725 + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); 2734 2726 unsigned char *addr = NULL; 2735 2727 int tp_len, size_max; 2736 2728 void *data; ··· 2789 2779 size_max = po->tx_ring.frame_size 2790 2780 - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); 2791 2781 2792 - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && 2793 - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) 2782 + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) 2794 2783 size_max = dev->mtu + reserve + VLAN_HLEN; 2795 2784 2796 2785 reinit_completion(&po->skb_completion); ··· 2818 2809 status = TP_STATUS_SEND_REQUEST; 2819 2810 hlen = LL_RESERVED_SPACE(dev); 2820 2811 tlen = dev->needed_tailroom; 2821 - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { 2812 + if (vnet_hdr_sz) { 2822 2813 vnet_hdr = data; 2823 - data += sizeof(*vnet_hdr); 2824 - tp_len -= sizeof(*vnet_hdr); 2814 + data += vnet_hdr_sz; 2815 + tp_len -= vnet_hdr_sz; 2825 2816 if (tp_len < 0 || 2826 2817 __packet_snd_vnet_parse(vnet_hdr, tp_len)) { 2827 2818 tp_len = -EINVAL; ··· 2846 2837 addr, hlen, copylen, &sockc); 2847 2838 if (likely(tp_len >= 0) && 2848 2839 tp_len > dev->mtu + reserve && 2849 - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) && 2840 + !vnet_hdr_sz && 2850 2841 !packet_extra_vlan_len_allowed(dev, skb)) 2851 2842 tp_len = -EMSGSIZE; 2852 2843 ··· 2865 2856 } 2866 2857 } 2867 2858 2868 - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { 2859 + if (vnet_hdr_sz) { 2869 2860 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { 2870 2861 tp_len = -EINVAL; 2871 2862 goto tpacket_error; ··· 2955 2946 struct virtio_net_hdr vnet_hdr = { 0 }; 2956 2947 int offset = 0; 2957 2948 struct packet_sock *po = pkt_sk(sk); 2958 - bool 
has_vnet_hdr = false; 2949 + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); 2959 2950 int hlen, tlen, linear; 2960 2951 int extra_len = 0; 2961 2952 ··· 2999 2990 3000 2991 if (sock->type == SOCK_RAW) 3001 2992 reserve = dev->hard_header_len; 3002 - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { 3003 - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); 2993 + if (vnet_hdr_sz) { 2994 + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); 3004 2995 if (err) 3005 2996 goto out_unlock; 3006 - has_vnet_hdr = true; 3007 2997 } 3008 2998 3009 2999 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { ··· 3072 3064 3073 3065 packet_parse_headers(skb, sock); 3074 3066 3075 - if (has_vnet_hdr) { 3067 + if (vnet_hdr_sz) { 3076 3068 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); 3077 3069 if (err) 3078 3070 goto out_free; 3079 - len += sizeof(vnet_hdr); 3071 + len += vnet_hdr_sz; 3080 3072 virtio_net_hdr_set_proto(skb, &vnet_hdr); 3081 3073 } 3082 3074 ··· 3416 3408 struct sock *sk = sock->sk; 3417 3409 struct sk_buff *skb; 3418 3410 int copied, err; 3419 - int vnet_hdr_len = 0; 3411 + int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); 3420 3412 unsigned int origlen = 0; 3421 3413 3422 3414 err = -EINVAL; ··· 3457 3449 3458 3450 packet_rcv_try_clear_pressure(pkt_sk(sk)); 3459 3451 3460 - if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) { 3461 - err = packet_rcv_vnet(msg, skb, &len); 3452 + if (vnet_hdr_len) { 3453 + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); 3462 3454 if (err) 3463 3455 goto out_free; 3464 - vnet_hdr_len = sizeof(struct virtio_net_hdr); 3465 3456 } 3466 3457 3467 3458 /* You lose any data beyond the buffer you gave. 
If it worries ··· 3922 3915 return 0; 3923 3916 } 3924 3917 case PACKET_VNET_HDR: 3918 + case PACKET_VNET_HDR_SZ: 3925 3919 { 3926 - int val; 3920 + int val, hdr_len; 3927 3921 3928 3922 if (sock->type != SOCK_RAW) 3929 3923 return -EINVAL; ··· 3933 3925 if (copy_from_sockptr(&val, optval, sizeof(val))) 3934 3926 return -EFAULT; 3935 3927 3928 + if (optname == PACKET_VNET_HDR_SZ) { 3929 + if (val && val != sizeof(struct virtio_net_hdr) && 3930 + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) 3931 + return -EINVAL; 3932 + hdr_len = val; 3933 + } else { 3934 + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; 3935 + } 3936 3936 lock_sock(sk); 3937 3937 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3938 3938 ret = -EBUSY; 3939 3939 } else { 3940 - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); 3940 + WRITE_ONCE(po->vnet_hdr_sz, hdr_len); 3941 3941 ret = 0; 3942 3942 } 3943 3943 release_sock(sk); ··· 4078 4062 val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); 4079 4063 break; 4080 4064 case PACKET_VNET_HDR: 4081 - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); 4065 + val = !!READ_ONCE(po->vnet_hdr_sz); 4066 + break; 4067 + case PACKET_VNET_HDR_SZ: 4068 + val = READ_ONCE(po->vnet_hdr_sz); 4082 4069 break; 4083 4070 case PACKET_VERSION: 4084 4071 val = po->tp_version;
+1 -1
net/packet/diag.c
··· 27 27 pinfo.pdi_flags |= PDI_AUXDATA; 28 28 if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) 29 29 pinfo.pdi_flags |= PDI_ORIGDEV; 30 - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) 30 + if (READ_ONCE(po->vnet_hdr_sz)) 31 31 pinfo.pdi_flags |= PDI_VNETHDR; 32 32 if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) 33 33 pinfo.pdi_flags |= PDI_LOSS;
+1 -1
net/packet/internal.h
··· 118 118 struct mutex pg_vec_lock; 119 119 unsigned long flags; 120 120 int ifindex; /* bound device */ 121 + u8 vnet_hdr_sz; 121 122 __be16 num; 122 123 struct packet_rollover *rollover; 123 124 struct packet_mclist *mclist; ··· 140 139 PACKET_SOCK_AUXDATA, 141 140 PACKET_SOCK_TX_HAS_OFF, 142 141 PACKET_SOCK_TP_LOSS, 143 - PACKET_SOCK_HAS_VNET_HDR, 144 142 PACKET_SOCK_RUNNING, 145 143 PACKET_SOCK_PRESSURE, 146 144 PACKET_SOCK_QDISC_BYPASS,