Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: accept UFO datagrams from tuntap and packet

Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.

Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.

Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.

It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.

To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").

(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.

Tested
Booted a v4.13 guest kernel with QEMU. On a host kernel before this
patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
enabled, same as on a v4.13 host kernel.

A UFO packet sent from the guest appears on the tap device:
host:
nc -l -u -p 8000 &
tcpdump -n -i tap0

guest:
dd if=/dev/zero of=payload.txt bs=1 count=2000
nc -u 192.16.1.1 8000 < payload.txt

Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
packets arriving fragmented:

./with_tap_pair.sh ./tap_send_ufo tap0 tap1
(from https://github.com/wdebruij/kerneltools/tree/master/tests)

Changes
v1 -> v2
- simplified set_offload change (review comment)
- documented test procedure

Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Willem de Bruijn; committed by David S. Miller.
Commit: 0c19f846 (parent: 9e77d7a5)

+209 -14
+1 -1
drivers/net/tap.c
··· 1077 1077 case TUNSETOFFLOAD: 1078 1078 /* let the user check for future flags */ 1079 1079 if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | 1080 - TUN_F_TSO_ECN)) 1080 + TUN_F_TSO_ECN | TUN_F_UFO)) 1081 1081 return -EINVAL; 1082 1082 1083 1083 rtnl_lock();
+2
drivers/net/tun.c
··· 2370 2370 features |= NETIF_F_TSO6; 2371 2371 arg &= ~(TUN_F_TSO4|TUN_F_TSO6); 2372 2372 } 2373 + 2374 + arg &= ~TUN_F_UFO; 2373 2375 } 2374 2376 2375 2377 /* This gives the user a way to test for new features in future by
+3 -1
include/linux/netdev_features.h
··· 54 54 NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ 55 55 NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ 56 56 NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ 57 + NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ 57 58 /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ 58 - NETIF_F_GSO_ESP_BIT, 59 + NETIF_F_GSO_UDP_BIT, 59 60 60 61 NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ 61 62 NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ ··· 133 132 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) 134 133 #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) 135 134 #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) 135 + #define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP) 136 136 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) 137 137 #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) 138 138 #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX)
+1
include/linux/netdevice.h
··· 4140 4140 BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); 4141 4141 BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); 4142 4142 BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); 4143 + BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); 4143 4144 4144 4145 return (features & feature) == feature; 4145 4146 }
+2
include/linux/skbuff.h
··· 568 568 SKB_GSO_SCTP = 1 << 14, 569 569 570 570 SKB_GSO_ESP = 1 << 15, 571 + 572 + SKB_GSO_UDP = 1 << 16, 571 573 }; 572 574 573 575 #if BITS_PER_LONG > 32
+4 -1
include/linux/virtio_net.h
··· 9 9 const struct virtio_net_hdr *hdr, 10 10 bool little_endian) 11 11 { 12 - unsigned short gso_type = 0; 12 + unsigned int gso_type = 0; 13 13 14 14 if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { 15 15 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { ··· 18 18 break; 19 19 case VIRTIO_NET_HDR_GSO_TCPV6: 20 20 gso_type = SKB_GSO_TCPV6; 21 + break; 22 + case VIRTIO_NET_HDR_GSO_UDP: 23 + gso_type = SKB_GSO_UDP; 21 24 break; 22 25 default: 23 26 return -EINVAL;
+1
include/net/ipv6.h
··· 767 767 __be32 ipv6_select_ident(struct net *net, 768 768 const struct in6_addr *daddr, 769 769 const struct in6_addr *saddr); 770 + __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); 770 771 771 772 int ip6_dst_hoplimit(struct dst_entry *dst); 772 773
+2 -1
net/core/dev.c
··· 2746 2746 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2747 2747 { 2748 2748 if (tx_path) 2749 - return skb->ip_summed != CHECKSUM_PARTIAL; 2749 + return skb->ip_summed != CHECKSUM_PARTIAL && 2750 + skb->ip_summed != CHECKSUM_UNNECESSARY; 2750 2751 2751 2752 return skb->ip_summed == CHECKSUM_NONE; 2752 2753 }
+10 -2
net/ipv4/af_inet.c
··· 1223 1223 struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1224 1224 netdev_features_t features) 1225 1225 { 1226 - bool fixedid = false, gso_partial, encap; 1226 + bool udpfrag = false, fixedid = false, gso_partial, encap; 1227 1227 struct sk_buff *segs = ERR_PTR(-EINVAL); 1228 1228 const struct net_offload *ops; 1229 + unsigned int offset = 0; 1229 1230 struct iphdr *iph; 1230 1231 int proto, tot_len; 1231 1232 int nhoff; ··· 1261 1260 segs = ERR_PTR(-EPROTONOSUPPORT); 1262 1261 1263 1262 if (!skb->encapsulation || encap) { 1263 + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); 1264 1264 fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); 1265 1265 1266 1266 /* fixed ID is invalid if DF bit is not set */ ··· 1281 1279 skb = segs; 1282 1280 do { 1283 1281 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); 1284 - if (skb_is_gso(skb)) { 1282 + if (udpfrag) { 1283 + iph->frag_off = htons(offset >> 3); 1284 + if (skb->next) 1285 + iph->frag_off |= htons(IP_MF); 1286 + offset += skb->len - nhoff - ihl; 1287 + tot_len = skb->len - nhoff; 1288 + } else if (skb_is_gso(skb)) { 1285 1289 if (!fixedid) { 1286 1290 iph->id = htons(id); 1287 1291 id += skb_shinfo(skb)->gso_segs;
+45 -4
net/ipv4/udp_offload.c
··· 187 187 } 188 188 EXPORT_SYMBOL(skb_udp_tunnel_segment); 189 189 190 - static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, 191 - netdev_features_t features) 190 + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 191 + netdev_features_t features) 192 192 { 193 193 struct sk_buff *segs = ERR_PTR(-EINVAL); 194 + unsigned int mss; 195 + __wsum csum; 196 + struct udphdr *uh; 197 + struct iphdr *iph; 194 198 195 199 if (skb->encapsulation && 196 200 (skb_shinfo(skb)->gso_type & 197 - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) 201 + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { 198 202 segs = skb_udp_tunnel_segment(skb, features, false); 203 + goto out; 204 + } 199 205 206 + if (!pskb_may_pull(skb, sizeof(struct udphdr))) 207 + goto out; 208 + 209 + mss = skb_shinfo(skb)->gso_size; 210 + if (unlikely(skb->len <= mss)) 211 + goto out; 212 + 213 + /* Do software UFO. Complete and fill in the UDP checksum as 214 + * HW cannot do checksum of UDP packets sent as multiple 215 + * IP fragments. 216 + */ 217 + 218 + uh = udp_hdr(skb); 219 + iph = ip_hdr(skb); 220 + 221 + uh->check = 0; 222 + csum = skb_checksum(skb, 0, skb->len, 0); 223 + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); 224 + if (uh->check == 0) 225 + uh->check = CSUM_MANGLED_0; 226 + 227 + skb->ip_summed = CHECKSUM_UNNECESSARY; 228 + 229 + /* If there is no outer header we can fake a checksum offload 230 + * due to the fact that we have already done the checksum in 231 + * software prior to segmenting the frame. 232 + */ 233 + if (!skb->encap_hdr_csum) 234 + features |= NETIF_F_HW_CSUM; 235 + 236 + /* Fragment the skb. 
IP headers of the fragments are updated in 237 + * inet_gso_segment() 238 + */ 239 + segs = skb_segment(skb, features); 240 + out: 200 241 return segs; 201 242 } 202 243 ··· 371 330 372 331 static const struct net_offload udpv4_offload = { 373 332 .callbacks = { 374 - .gso_segment = udp4_tunnel_segment, 333 + .gso_segment = udp4_ufo_fragment, 375 334 .gro_receive = udp4_gro_receive, 376 335 .gro_complete = udp4_gro_complete, 377 336 },
+31
net/ipv6/output_core.c
··· 31 31 return id; 32 32 } 33 33 34 + /* This function exists only for tap drivers that must support broken 35 + * clients requesting UFO without specifying an IPv6 fragment ID. 36 + * 37 + * This is similar to ipv6_select_ident() but we use an independent hash 38 + * seed to limit information leakage. 39 + * 40 + * The network header must be set before calling this. 41 + */ 42 + __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) 43 + { 44 + static u32 ip6_proxy_idents_hashrnd __read_mostly; 45 + struct in6_addr buf[2]; 46 + struct in6_addr *addrs; 47 + u32 id; 48 + 49 + addrs = skb_header_pointer(skb, 50 + skb_network_offset(skb) + 51 + offsetof(struct ipv6hdr, saddr), 52 + sizeof(buf), buf); 53 + if (!addrs) 54 + return 0; 55 + 56 + net_get_random_once(&ip6_proxy_idents_hashrnd, 57 + sizeof(ip6_proxy_idents_hashrnd)); 58 + 59 + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, 60 + &addrs[1], &addrs[0]); 61 + return htonl(id); 62 + } 63 + EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); 64 + 34 65 __be32 ipv6_select_ident(struct net *net, 35 66 const struct in6_addr *daddr, 36 67 const struct in6_addr *saddr)
+82 -3
net/ipv6/udp_offload.c
··· 17 17 #include <net/ip6_checksum.h> 18 18 #include "ip6_offload.h" 19 19 20 - static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, 21 - netdev_features_t features) 20 + static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, 21 + netdev_features_t features) 22 22 { 23 23 struct sk_buff *segs = ERR_PTR(-EINVAL); 24 + unsigned int mss; 25 + unsigned int unfrag_ip6hlen, unfrag_len; 26 + struct frag_hdr *fptr; 27 + u8 *packet_start, *prevhdr; 28 + u8 nexthdr; 29 + u8 frag_hdr_sz = sizeof(struct frag_hdr); 30 + __wsum csum; 31 + int tnl_hlen; 32 + int err; 33 + 34 + mss = skb_shinfo(skb)->gso_size; 35 + if (unlikely(skb->len <= mss)) 36 + goto out; 24 37 25 38 if (skb->encapsulation && skb_shinfo(skb)->gso_type & 26 39 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) 27 40 segs = skb_udp_tunnel_segment(skb, features, true); 41 + else { 42 + const struct ipv6hdr *ipv6h; 43 + struct udphdr *uh; 28 44 45 + if (!pskb_may_pull(skb, sizeof(struct udphdr))) 46 + goto out; 47 + 48 + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 49 + * do checksum of UDP packets sent as multiple IP fragments. 50 + */ 51 + 52 + uh = udp_hdr(skb); 53 + ipv6h = ipv6_hdr(skb); 54 + 55 + uh->check = 0; 56 + csum = skb_checksum(skb, 0, skb->len, 0); 57 + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, 58 + &ipv6h->daddr, csum); 59 + if (uh->check == 0) 60 + uh->check = CSUM_MANGLED_0; 61 + 62 + skb->ip_summed = CHECKSUM_UNNECESSARY; 63 + 64 + /* If there is no outer header we can fake a checksum offload 65 + * due to the fact that we have already done the checksum in 66 + * software prior to segmenting the frame. 67 + */ 68 + if (!skb->encap_hdr_csum) 69 + features |= NETIF_F_HW_CSUM; 70 + 71 + /* Check if there is enough headroom to insert fragment header. 
*/ 72 + tnl_hlen = skb_tnl_header_len(skb); 73 + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { 74 + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) 75 + goto out; 76 + } 77 + 78 + /* Find the unfragmentable header and shift it left by frag_hdr_sz 79 + * bytes to insert fragment header. 80 + */ 81 + err = ip6_find_1stfragopt(skb, &prevhdr); 82 + if (err < 0) 83 + return ERR_PTR(err); 84 + unfrag_ip6hlen = err; 85 + nexthdr = *prevhdr; 86 + *prevhdr = NEXTHDR_FRAGMENT; 87 + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + 88 + unfrag_ip6hlen + tnl_hlen; 89 + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; 90 + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); 91 + 92 + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; 93 + skb->mac_header -= frag_hdr_sz; 94 + skb->network_header -= frag_hdr_sz; 95 + 96 + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); 97 + fptr->nexthdr = nexthdr; 98 + fptr->reserved = 0; 99 + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); 100 + 101 + /* Fragment the skb. ipv6 header and the remaining fields of the 102 + * fragment header are updated in ipv6_gso_segment() 103 + */ 104 + segs = skb_segment(skb, features); 105 + } 106 + 107 + out: 29 108 return segs; 30 109 } 31 110 ··· 154 75 155 76 static const struct net_offload udpv6_offload = { 156 77 .callbacks = { 157 - .gso_segment = udp6_tunnel_segment, 78 + .gso_segment = udp6_ufo_fragment, 158 79 .gro_receive = udp6_gro_receive, 159 80 .gro_complete = udp6_gro_complete, 160 81 },
+14
net/openvswitch/datapath.c
··· 308 308 const struct dp_upcall_info *upcall_info, 309 309 uint32_t cutlen) 310 310 { 311 + unsigned short gso_type = skb_shinfo(skb)->gso_type; 312 + struct sw_flow_key later_key; 311 313 struct sk_buff *segs, *nskb; 312 314 int err; 313 315 ··· 320 318 if (segs == NULL) 321 319 return -EINVAL; 322 320 321 + if (gso_type & SKB_GSO_UDP) { 322 + /* The initial flow key extracted by ovs_flow_key_extract() 323 + * in this case is for a first fragment, so we need to 324 + * properly mark later fragments. 325 + */ 326 + later_key = *key; 327 + later_key.ip.frag = OVS_FRAG_TYPE_LATER; 328 + } 329 + 323 330 /* Queue all of the segments. */ 324 331 skb = segs; 325 332 do { 333 + if (gso_type & SKB_GSO_UDP && skb != segs) 334 + key = &later_key; 335 + 326 336 err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); 327 337 if (err) 328 338 break;
+5 -1
net/openvswitch/flow.c
··· 631 631 key->ip.frag = OVS_FRAG_TYPE_LATER; 632 632 return 0; 633 633 } 634 - if (nh->frag_off & htons(IP_MF)) 634 + if (nh->frag_off & htons(IP_MF) || 635 + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 635 636 key->ip.frag = OVS_FRAG_TYPE_FIRST; 636 637 else 637 638 key->ip.frag = OVS_FRAG_TYPE_NONE; ··· 748 747 749 748 if (key->ip.frag == OVS_FRAG_TYPE_LATER) 750 749 return 0; 750 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 751 + key->ip.frag = OVS_FRAG_TYPE_FIRST; 752 + 751 753 /* Transport layer. */ 752 754 if (key->ip.proto == NEXTHDR_TCP) { 753 755 if (tcphdr_ok(skb)) {
+6
net/sched/act_csum.c
··· 229 229 const struct iphdr *iph; 230 230 u16 ul; 231 231 232 + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 233 + return 1; 234 + 232 235 /* 233 236 * Support both UDP and UDPLITE checksum algorithms, Don't use 234 237 * udph->len to get the real length without any protocol check, ··· 284 281 struct udphdr *udph; 285 282 const struct ipv6hdr *ip6h; 286 283 u16 ul; 284 + 285 + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 286 + return 1; 287 287 288 288 /* 289 289 * Support both UDP and UDPLITE checksum algorithms, Don't use