Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

GSO: Support partial segmentation offload

This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header. The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_GSO_UDP_TUNNEL
NETIF_F_GSO_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Alexander Duyck and committed by
David S. Miller
802ab55a 1530545e

+151 -24
+5
include/linux/netdev_features.h
··· 48 48 NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ 49 49 NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ 50 50 NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ 51 + NETIF_F_GSO_PARTIAL_BIT, /* ... Only segment inner-most L4 52 + * in hardware and all other 53 + * headers in software. 54 + */ 51 55 NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ 52 56 /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ 53 57 NETIF_F_GSO_TUNNEL_REMCSUM_BIT, ··· 126 122 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) 127 123 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) 128 124 #define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID) 125 + #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) 129 126 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) 130 127 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) 131 128 #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX)
+2
include/linux/netdevice.h
··· 1654 1654 netdev_features_t vlan_features; 1655 1655 netdev_features_t hw_enc_features; 1656 1656 netdev_features_t mpls_features; 1657 + netdev_features_t gso_partial_features; 1657 1658 1658 1659 int ifindex; 1659 1660 int group; ··· 4005 4004 BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); 4006 4005 BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); 4007 4006 BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); 4007 + BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); 4008 4008 BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); 4009 4009 4010 4010 return (features & feature) == feature;
+7 -2
include/linux/skbuff.h
··· 483 483 484 484 SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12, 485 485 486 - SKB_GSO_TUNNEL_REMCSUM = 1 << 13, 486 + SKB_GSO_PARTIAL = 1 << 13, 487 + 488 + SKB_GSO_TUNNEL_REMCSUM = 1 << 14, 487 489 }; 488 490 489 491 #if BITS_PER_LONG > 32 ··· 3593 3591 * Keeps track of level of encapsulation of network headers. 3594 3592 */ 3595 3593 struct skb_gso_cb { 3596 - int mac_offset; 3594 + union { 3595 + int mac_offset; 3596 + int data_offset; 3597 + }; 3597 3598 int encap_level; 3598 3599 __wsum csum; 3599 3600 __u16 csum_start;
+33 -3
net/core/dev.c
··· 2711 2711 return ERR_PTR(err); 2712 2712 } 2713 2713 2714 + /* Only report GSO partial support if it will enable us to 2715 + * support segmentation on this frame without needing additional 2716 + * work. 2717 + */ 2718 + if (features & NETIF_F_GSO_PARTIAL) { 2719 + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2720 + struct net_device *dev = skb->dev; 2721 + 2722 + partial_features |= dev->features & dev->gso_partial_features; 2723 + if (!skb_gso_ok(skb, features | partial_features)) 2724 + features &= ~NETIF_F_GSO_PARTIAL; 2725 + } 2726 + 2714 2727 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2715 2728 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2716 2729 ··· 2847 2834 if (gso_segs > dev->gso_max_segs) 2848 2835 return features & ~NETIF_F_GSO_MASK; 2849 2836 2850 - /* Make sure to clear the IPv4 ID mangling feature if 2851 - * the IPv4 header has the potential to be fragmented. 2837 + /* Support for GSO partial features requires software 2838 + * intervention before we can actually process the packets 2839 + * so we need to strip support for any partial features now 2840 + * and we can pull them back in after we have partially 2841 + * segmented the frame. 2842 + */ 2843 + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2844 + features &= ~dev->gso_partial_features; 2845 + 2846 + /* Make sure to clear the IPv4 ID mangling feature if the 2847 + * IPv4 header has the potential to be fragmented. 2852 2848 */ 2853 2849 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2854 2850 struct iphdr *iph = skb->encapsulation ? 
··· 6751 6729 } 6752 6730 } 6753 6731 6732 + /* GSO partial features require GSO partial be set */ 6733 + if ((features & dev->gso_partial_features) && 6734 + !(features & NETIF_F_GSO_PARTIAL)) { 6735 + netdev_dbg(dev, 6736 + "Dropping partially supported GSO features since no GSO partial.\n"); 6737 + features &= ~dev->gso_partial_features; 6738 + } 6739 + 6754 6740 #ifdef CONFIG_NET_RX_BUSY_POLL 6755 6741 if (dev->netdev_ops->ndo_busy_poll) 6756 6742 features |= NETIF_F_BUSY_POLL; ··· 7041 7011 7042 7012 /* Make NETIF_F_SG inheritable to tunnel devices. 7043 7013 */ 7044 - dev->hw_enc_features |= NETIF_F_SG; 7014 + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; 7045 7015 7046 7016 /* Make NETIF_F_SG inheritable to MPLS. 7047 7017 */
+1
net/core/ethtool.c
··· 88 88 [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", 89 89 [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", 90 90 [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", 91 + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", 91 92 92 93 [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", 93 94 [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
+28 -1
net/core/skbuff.c
··· 3076 3076 struct sk_buff *frag_skb = head_skb; 3077 3077 unsigned int offset = doffset; 3078 3078 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 3079 + unsigned int partial_segs = 0; 3079 3080 unsigned int headroom; 3080 - unsigned int len; 3081 + unsigned int len = head_skb->len; 3081 3082 __be16 proto; 3082 3083 bool csum; 3083 3084 int sg = !!(features & NETIF_F_SG); ··· 3094 3093 return ERR_PTR(-EINVAL); 3095 3094 3096 3095 csum = !!can_checksum_protocol(features, proto); 3096 + 3097 + /* GSO partial only requires that we trim off any excess that 3098 + * doesn't fit into an MSS sized block, so take care of that 3099 + * now. 3100 + */ 3101 + if (features & NETIF_F_GSO_PARTIAL) { 3102 + partial_segs = len / mss; 3103 + mss *= partial_segs; 3104 + } 3097 3105 3098 3106 headroom = skb_headroom(head_skb); 3099 3107 pos = skb_headlen(head_skb); ··· 3290 3280 * (see validate_xmit_skb_list() for example) 3291 3281 */ 3292 3282 segs->prev = tail; 3283 + 3284 + /* Update GSO info on first skb in partial sequence. */ 3285 + if (partial_segs) { 3286 + int type = skb_shinfo(head_skb)->gso_type; 3287 + 3288 + /* Update type to add partial and then remove dodgy if set */ 3289 + type |= SKB_GSO_PARTIAL; 3290 + type &= ~SKB_GSO_DODGY; 3291 + 3292 + /* Update GSO info and prepare to start updating headers on 3293 + * our way back down the stack of protocols. 3294 + */ 3295 + skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; 3296 + skb_shinfo(segs)->gso_segs = partial_segs; 3297 + skb_shinfo(segs)->gso_type = type; 3298 + SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; 3299 + } 3293 3300 3294 3301 /* Following permits correct backpressure, for protocols 3295 3302 * using skb_set_owner_w().
+16 -4
net/ipv4/af_inet.c
··· 1200 1200 const struct net_offload *ops; 1201 1201 unsigned int offset = 0; 1202 1202 struct iphdr *iph; 1203 - int proto; 1203 + int proto, tot_len; 1204 1204 int nhoff; 1205 1205 int ihl; 1206 1206 int id; ··· 1219 1219 SKB_GSO_UDP_TUNNEL_CSUM | 1220 1220 SKB_GSO_TCP_FIXEDID | 1221 1221 SKB_GSO_TUNNEL_REMCSUM | 1222 + SKB_GSO_PARTIAL | 1222 1223 0))) 1223 1224 goto out; 1224 1225 ··· 1274 1273 if (skb->next) 1275 1274 iph->frag_off |= htons(IP_MF); 1276 1275 offset += skb->len - nhoff - ihl; 1277 - } else if (!fixedid) { 1278 - iph->id = htons(id++); 1276 + tot_len = skb->len - nhoff; 1277 + } else if (skb_is_gso(skb)) { 1278 + if (!fixedid) { 1279 + iph->id = htons(id); 1280 + id += skb_shinfo(skb)->gso_segs; 1281 + } 1282 + tot_len = skb_shinfo(skb)->gso_size + 1283 + SKB_GSO_CB(skb)->data_offset + 1284 + skb->head - (unsigned char *)iph; 1285 + } else { 1286 + if (!fixedid) 1287 + iph->id = htons(id++); 1288 + tot_len = skb->len - nhoff; 1279 1289 } 1280 - iph->tot_len = htons(skb->len - nhoff); 1290 + iph->tot_len = htons(tot_len); 1281 1291 ip_send_check(iph); 1282 1292 if (encap) 1283 1293 skb_reset_inner_headers(skb);
+21 -5
net/ipv4/gre_offload.c
··· 36 36 SKB_GSO_GRE | 37 37 SKB_GSO_GRE_CSUM | 38 38 SKB_GSO_IPIP | 39 - SKB_GSO_SIT))) 39 + SKB_GSO_SIT | 40 + SKB_GSO_PARTIAL))) 40 41 goto out; 41 42 42 43 if (!skb->encapsulation) ··· 88 87 skb = segs; 89 88 do { 90 89 struct gre_base_hdr *greh; 91 - __be32 *pcsum; 90 + __sum16 *pcsum; 92 91 93 92 /* Set up inner headers if we are offloading inner checksum */ 94 93 if (skb->ip_summed == CHECKSUM_PARTIAL) { ··· 108 107 continue; 109 108 110 109 greh = (struct gre_base_hdr *)skb_transport_header(skb); 111 - pcsum = (__be32 *)(greh + 1); 110 + pcsum = (__sum16 *)(greh + 1); 112 111 113 - *pcsum = 0; 114 - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); 112 + if (skb_is_gso(skb)) { 113 + unsigned int partial_adj; 114 + 115 + /* Adjust checksum to account for the fact that 116 + * the partial checksum is based on actual size 117 + * whereas headers should be based on MSS size. 118 + */ 119 + partial_adj = skb->len + skb_headroom(skb) - 120 + SKB_GSO_CB(skb)->data_offset - 121 + skb_shinfo(skb)->gso_size; 122 + *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); 123 + } else { 124 + *pcsum = 0; 125 + } 126 + 127 + *(pcsum + 1) = 0; 128 + *pcsum = gso_make_checksum(skb, 0); 115 129 } while ((skb = skb->next)); 116 130 out: 117 131 return segs;
+8 -2
net/ipv4/tcp_offload.c
··· 109 109 goto out; 110 110 } 111 111 112 + /* GSO partial only requires splitting the frame into an MSS 113 + * multiple and possibly a remainder. So update the mss now. 114 + */ 115 + if (features & NETIF_F_GSO_PARTIAL) 116 + mss = skb->len - (skb->len % mss); 117 + 112 118 copy_destructor = gso_skb->destructor == tcp_wfree; 113 119 ooo_okay = gso_skb->ooo_okay; 114 120 /* All segments but the first should have ooo_okay cleared */ ··· 139 133 newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + 140 134 (__force u32)delta)); 141 135 142 - do { 136 + while (skb->next) { 143 137 th->fin = th->psh = 0; 144 138 th->check = newcheck; 145 139 ··· 159 153 160 154 th->seq = htonl(seq); 161 155 th->cwr = 0; 162 - } while (skb->next); 156 + } 163 157 164 158 /* Following permits TCP Small Queues to work well with GSO : 165 159 * The callback to TCP stack will be called at the time last frag
+21 -6
net/ipv4/udp_offload.c
··· 39 39 * 16 bit length field due to the header being added outside of an 40 40 * IP or IPv6 frame that was already limited to 64K - 1. 41 41 */ 42 - partial = csum_sub(csum_unfold(uh->check), 43 - (__force __wsum)htonl(skb->len)); 42 + if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) 43 + partial = (__force __wsum)uh->len; 44 + else 45 + partial = (__force __wsum)htonl(skb->len); 46 + partial = csum_sub(csum_unfold(uh->check), partial); 44 47 45 48 /* setup inner skb. */ 46 49 skb->encapsulation = 0; ··· 92 89 udp_offset = outer_hlen - tnl_hlen; 93 90 skb = segs; 94 91 do { 95 - __be16 len; 92 + unsigned int len; 96 93 97 94 if (remcsum) 98 95 skb->ip_summed = CHECKSUM_NONE; ··· 110 107 skb_reset_mac_header(skb); 111 108 skb_set_network_header(skb, mac_len); 112 109 skb_set_transport_header(skb, udp_offset); 113 - len = htons(skb->len - udp_offset); 110 + len = skb->len - udp_offset; 114 111 uh = udp_hdr(skb); 115 - uh->len = len; 112 + 113 + /* If we are only performing partial GSO the inner header 114 + * will be using a length value equal to only one MSS sized 115 + * segment instead of the entire frame. 116 + */ 117 + if (skb_is_gso(skb)) { 118 + uh->len = htons(skb_shinfo(skb)->gso_size + 119 + SKB_GSO_CB(skb)->data_offset + 120 + skb->head - (unsigned char *)uh); 121 + } else { 122 + uh->len = htons(len); 123 + } 116 124 117 125 if (!need_csum) 118 126 continue; 119 127 120 - uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len)); 128 + uh->check = ~csum_fold(csum_add(partial, 129 + (__force __wsum)htonl(len))); 121 130 122 131 if (skb->encapsulation || !offload_csum) { 123 132 uh->check = gso_make_checksum(skb, ~uh->check);
+9 -1
net/ipv6/ip6_offload.c
··· 63 63 int proto; 64 64 struct frag_hdr *fptr; 65 65 unsigned int unfrag_ip6hlen; 66 + unsigned int payload_len; 66 67 u8 *prevhdr; 67 68 int offset = 0; 68 69 bool encap, udpfrag; ··· 83 82 SKB_GSO_UDP_TUNNEL | 84 83 SKB_GSO_UDP_TUNNEL_CSUM | 85 84 SKB_GSO_TUNNEL_REMCSUM | 85 + SKB_GSO_PARTIAL | 86 86 0))) 87 87 goto out; 88 88 ··· 120 118 121 119 for (skb = segs; skb; skb = skb->next) { 122 120 ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); 123 - ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); 121 + if (skb_is_gso(skb)) 122 + payload_len = skb_shinfo(skb)->gso_size + 123 + SKB_GSO_CB(skb)->data_offset + 124 + skb->head - (unsigned char *)(ipv6h + 1); 125 + else 126 + payload_len = skb->len - nhoff - sizeof(*ipv6h); 127 + ipv6h->payload_len = htons(payload_len); 124 128 skb->network_header = (u8 *)ipv6h - skb->head; 125 129 126 130 if (udpfrag) {