Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vmxnet3: add geneve and vxlan tunnel offload support

The vmxnet3 version 3 device supports checksum/TSO offload. Thus, vNIC to
pNIC traffic can leverage hardware checksum/TSO offloads. However,
vmxnet3 does not support checksum/TSO offload for Geneve/VXLAN
encapsulated packets. Thus, for a vNIC configured with an overlay, the
guest stack must first segment the inner packet, compute the inner
checksum for each segment and encapsulate each segment before
transmitting the packet via the vNIC. This results in a significant
performance penalty.

This patch will enhance vmxnet3 to support Geneve/VXLAN TSO as well as
checksum offload.

Signed-off-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Ronak Doshi and committed by
David S. Miller
dacce2be d3a8a9e5

+161 -33
+3
drivers/net/vmxnet3/upt1_defs.h
··· 92 92 UPT1_F_RSS = cpu_to_le64(0x0002), 93 93 UPT1_F_RXVLAN = cpu_to_le64(0x0004), /* VLAN tag stripping */ 94 94 UPT1_F_LRO = cpu_to_le64(0x0008), 95 + UPT1_F_RXINNEROFLD = cpu_to_le64(0x00010), /* Geneve/Vxlan rx csum 96 + * offloading 97 + */ 95 98 }; 96 99 #endif
+10 -7
drivers/net/vmxnet3/vmxnet3_defs.h
··· 103 103 /* 104 104 * Little Endian layout of bitfields - 105 105 * Byte 0 : 7.....len.....0 106 - * Byte 1 : rsvd gen 13.len.8 106 + * Byte 1 : oco gen 13.len.8 107 107 * Byte 2 : 5.msscof.0 ext1 dtype 108 108 * Byte 3 : 13...msscof...6 109 109 * 110 110 * Big Endian layout of bitfields - 111 111 * Byte 0: 13...msscof...6 112 112 * Byte 1 : 5.msscof.0 ext1 dtype 113 - * Byte 2 : rsvd gen 13.len.8 113 + * Byte 2 : oco gen 13.len.8 114 114 * Byte 3 : 7.....len.....0 115 115 * 116 116 * Thus, le32_to_cpu on the dword will allow the big endian driver to read ··· 125 125 u32 msscof:14; /* MSS, checksum offset, flags */ 126 126 u32 ext1:1; 127 127 u32 dtype:1; /* descriptor type */ 128 - u32 rsvd:1; 128 + u32 oco:1; 129 129 u32 gen:1; /* generation bit */ 130 130 u32 len:14; 131 131 #else 132 132 u32 len:14; 133 133 u32 gen:1; /* generation bit */ 134 - u32 rsvd:1; 134 + u32 oco:1; 135 135 u32 dtype:1; /* descriptor type */ 136 136 u32 ext1:1; 137 137 u32 msscof:14; /* MSS, checksum offset, flags */ ··· 157 157 }; 158 158 159 159 /* TxDesc.OM values */ 160 - #define VMXNET3_OM_NONE 0 161 - #define VMXNET3_OM_CSUM 2 162 - #define VMXNET3_OM_TSO 3 160 + #define VMXNET3_OM_NONE 0 161 + #define VMXNET3_OM_ENCAP 1 162 + #define VMXNET3_OM_CSUM 2 163 + #define VMXNET3_OM_TSO 3 163 164 164 165 /* fields in TxDesc we access w/o using bit fields */ 165 166 #define VMXNET3_TXD_EOP_SHIFT 12 ··· 226 225 /* fields in RxDesc we access w/o using bit fields */ 227 226 #define VMXNET3_RXD_BTYPE_SHIFT 14 228 227 #define VMXNET3_RXD_GEN_SHIFT 31 228 + 229 + #define VMXNET3_RCD_HDR_INNER_SHIFT 13 229 230 230 231 struct Vmxnet3_RxCompDesc { 231 232 #ifdef __BIG_ENDIAN_BITFIELD
+98 -22
drivers/net/vmxnet3/vmxnet3_drv.c
··· 842 842 u8 protocol = 0; 843 843 844 844 if (ctx->mss) { /* TSO */ 845 - ctx->eth_ip_hdr_size = skb_transport_offset(skb); 846 - ctx->l4_hdr_size = tcp_hdrlen(skb); 847 - ctx->copy_size = ctx->eth_ip_hdr_size + ctx->l4_hdr_size; 845 + if (VMXNET3_VERSION_GE_4(adapter) && skb->encapsulation) { 846 + ctx->l4_offset = skb_inner_transport_offset(skb); 847 + ctx->l4_hdr_size = inner_tcp_hdrlen(skb); 848 + ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size; 849 + } else { 850 + ctx->l4_offset = skb_transport_offset(skb); 851 + ctx->l4_hdr_size = tcp_hdrlen(skb); 852 + ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size; 853 + } 848 854 } else { 849 855 if (skb->ip_summed == CHECKSUM_PARTIAL) { 850 - ctx->eth_ip_hdr_size = skb_checksum_start_offset(skb); 856 + /* For encap packets, skb_checksum_start_offset refers 857 + * to inner L4 offset. Thus, below works for encap as 858 + * well as non-encap case 859 + */ 860 + ctx->l4_offset = skb_checksum_start_offset(skb); 851 861 852 862 if (ctx->ipv4) { 853 863 const struct iphdr *iph = ip_hdr(skb); ··· 881 871 break; 882 872 } 883 873 884 - ctx->copy_size = min(ctx->eth_ip_hdr_size + 874 + ctx->copy_size = min(ctx->l4_offset + 885 875 ctx->l4_hdr_size, skb->len); 886 876 } else { 887 - ctx->eth_ip_hdr_size = 0; 877 + ctx->l4_offset = 0; 888 878 ctx->l4_hdr_size = 0; 889 879 /* copy as much as allowed */ 890 880 ctx->copy_size = min_t(unsigned int, ··· 938 928 ctx->copy_size, tq->tx_ring.next2fill); 939 929 } 940 930 931 + 932 + static void 933 + vmxnet3_prepare_inner_tso(struct sk_buff *skb, 934 + struct vmxnet3_tx_ctx *ctx) 935 + { 936 + struct tcphdr *tcph = inner_tcp_hdr(skb); 937 + struct iphdr *iph = inner_ip_hdr(skb); 938 + 939 + if (ctx->ipv4) { 940 + iph->check = 0; 941 + tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 942 + IPPROTO_TCP, 0); 943 + } else if (ctx->ipv6) { 944 + struct ipv6hdr *iph = inner_ipv6_hdr(skb); 945 + 946 + tcph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 0, 947 + 
IPPROTO_TCP, 0); 948 + } 949 + } 941 950 942 951 static void 943 952 vmxnet3_prepare_tso(struct sk_buff *skb, ··· 1016 987 /* Use temporary descriptor to avoid touching bits multiple times */ 1017 988 union Vmxnet3_GenericDesc tempTxDesc; 1018 989 #endif 990 + struct udphdr *udph; 1019 991 1020 992 count = txd_estimate(skb); 1021 993 ··· 1033 1003 } 1034 1004 tq->stats.copy_skb_header++; 1035 1005 } 1036 - vmxnet3_prepare_tso(skb, &ctx); 1006 + if (skb->encapsulation) { 1007 + vmxnet3_prepare_inner_tso(skb, &ctx); 1008 + } else { 1009 + vmxnet3_prepare_tso(skb, &ctx); 1010 + } 1037 1011 } else { 1038 1012 if (unlikely(count > VMXNET3_MAX_TXD_PER_PKT)) { 1039 1013 ··· 1060 1026 BUG_ON(ret <= 0 && ctx.copy_size != 0); 1061 1027 /* hdrs parsed, check against other limits */ 1062 1028 if (ctx.mss) { 1063 - if (unlikely(ctx.eth_ip_hdr_size + ctx.l4_hdr_size > 1029 + if (unlikely(ctx.l4_offset + ctx.l4_hdr_size > 1064 1030 VMXNET3_MAX_TX_BUF_SIZE)) { 1065 1031 tq->stats.drop_oversized_hdr++; 1066 1032 goto drop_pkt; 1067 1033 } 1068 1034 } else { 1069 1035 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1070 - if (unlikely(ctx.eth_ip_hdr_size + 1036 + if (unlikely(ctx.l4_offset + 1071 1037 skb->csum_offset > 1072 1038 VMXNET3_MAX_CSUM_OFFSET)) { 1073 1039 tq->stats.drop_oversized_hdr++; ··· 1114 1080 #endif 1115 1081 tx_num_deferred = le32_to_cpu(tq->shared->txNumDeferred); 1116 1082 if (ctx.mss) { 1117 - gdesc->txd.hlen = ctx.eth_ip_hdr_size + ctx.l4_hdr_size; 1118 - gdesc->txd.om = VMXNET3_OM_TSO; 1119 - gdesc->txd.msscof = ctx.mss; 1083 + if (VMXNET3_VERSION_GE_4(adapter) && skb->encapsulation) { 1084 + gdesc->txd.hlen = ctx.l4_offset + ctx.l4_hdr_size; 1085 + gdesc->txd.om = VMXNET3_OM_ENCAP; 1086 + gdesc->txd.msscof = ctx.mss; 1087 + 1088 + udph = udp_hdr(skb); 1089 + if (udph->check) 1090 + gdesc->txd.oco = 1; 1091 + } else { 1092 + gdesc->txd.hlen = ctx.l4_offset + ctx.l4_hdr_size; 1093 + gdesc->txd.om = VMXNET3_OM_TSO; 1094 + gdesc->txd.msscof = ctx.mss; 1095 + } 1120 
1096 num_pkts = (skb->len - gdesc->txd.hlen + ctx.mss - 1) / ctx.mss; 1121 1097 } else { 1122 1098 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1123 - gdesc->txd.hlen = ctx.eth_ip_hdr_size; 1124 - gdesc->txd.om = VMXNET3_OM_CSUM; 1125 - gdesc->txd.msscof = ctx.eth_ip_hdr_size + 1126 - skb->csum_offset; 1099 + if (VMXNET3_VERSION_GE_4(adapter) && 1100 + skb->encapsulation) { 1101 + gdesc->txd.hlen = ctx.l4_offset + 1102 + ctx.l4_hdr_size; 1103 + gdesc->txd.om = VMXNET3_OM_ENCAP; 1104 + gdesc->txd.msscof = 0; /* Reserved */ 1105 + } else { 1106 + gdesc->txd.hlen = ctx.l4_offset; 1107 + gdesc->txd.om = VMXNET3_OM_CSUM; 1108 + gdesc->txd.msscof = ctx.l4_offset + 1109 + skb->csum_offset; 1110 + } 1127 1111 } else { 1128 1112 gdesc->txd.om = 0; 1129 1113 gdesc->txd.msscof = 0; ··· 1220 1168 (le32_to_cpu(gdesc->dword[3]) & 1221 1169 VMXNET3_RCD_CSUM_OK) == VMXNET3_RCD_CSUM_OK) { 1222 1170 skb->ip_summed = CHECKSUM_UNNECESSARY; 1223 - BUG_ON(!(gdesc->rcd.tcp || gdesc->rcd.udp)); 1224 - BUG_ON(gdesc->rcd.frg); 1171 + WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && 1172 + !(le32_to_cpu(gdesc->dword[0]) & 1173 + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); 1174 + WARN_ON_ONCE(gdesc->rcd.frg && 1175 + !(le32_to_cpu(gdesc->dword[0]) & 1176 + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); 1225 1177 } else if (gdesc->rcd.v6 && (le32_to_cpu(gdesc->dword[3]) & 1226 1178 (1 << VMXNET3_RCD_TUC_SHIFT))) { 1227 1179 skb->ip_summed = CHECKSUM_UNNECESSARY; 1228 - BUG_ON(!(gdesc->rcd.tcp || gdesc->rcd.udp)); 1229 - BUG_ON(gdesc->rcd.frg); 1180 + WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && 1181 + !(le32_to_cpu(gdesc->dword[0]) & 1182 + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); 1183 + WARN_ON_ONCE(gdesc->rcd.frg && 1184 + !(le32_to_cpu(gdesc->dword[0]) & 1185 + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); 1230 1186 } else { 1231 1187 if (gdesc->rcd.csum) { 1232 1188 skb->csum = htons(gdesc->rcd.csum); ··· 2489 2429 if (adapter->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) 2490 2430 
devRead->misc.uptFeatures |= UPT1_F_RXVLAN; 2491 2431 2432 + if (adapter->netdev->features & (NETIF_F_GSO_UDP_TUNNEL | 2433 + NETIF_F_GSO_UDP_TUNNEL_CSUM)) 2434 + devRead->misc.uptFeatures |= UPT1_F_RXINNEROFLD; 2435 + 2492 2436 devRead->misc.mtu = cpu_to_le32(adapter->netdev->mtu); 2493 2437 devRead->misc.queueDescPA = cpu_to_le64(adapter->queue_desc_pa); 2494 2438 devRead->misc.queueDescLen = cpu_to_le32( ··· 2625 2561 union Vmxnet3_CmdInfo *cmdInfo = &shared->cu.cmdInfo; 2626 2562 unsigned long flags; 2627 2563 2628 - if (!VMXNET3_VERSION_GE_4(adapter)) 2629 - return; 2564 + if (!VMXNET3_VERSION_GE_4(adapter)) 2565 + return; 2630 2566 2631 2567 spin_lock_irqsave(&adapter->cmd_lock, flags); 2632 2568 ··· 3137 3073 NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | 3138 3074 NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | 3139 3075 NETIF_F_LRO; 3076 + 3077 + if (VMXNET3_VERSION_GE_4(adapter)) { 3078 + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | 3079 + NETIF_F_GSO_UDP_TUNNEL_CSUM; 3080 + 3081 + netdev->hw_enc_features = NETIF_F_SG | NETIF_F_RXCSUM | 3082 + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | 3083 + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | 3084 + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | 3085 + NETIF_F_GSO_UDP_TUNNEL_CSUM; 3086 + } 3087 + 3140 3088 if (dma64) 3141 3089 netdev->hw_features |= NETIF_F_HIGHDMA; 3142 3090 netdev->vlan_features = netdev->hw_features &
+41 -1
drivers/net/vmxnet3/vmxnet3_ethtool.c
··· 267 267 return features; 268 268 } 269 269 270 + static void vmxnet3_enable_encap_offloads(struct net_device *netdev) 271 + { 272 + struct vmxnet3_adapter *adapter = netdev_priv(netdev); 273 + 274 + if (VMXNET3_VERSION_GE_4(adapter)) { 275 + netdev->hw_enc_features |= NETIF_F_SG | NETIF_F_RXCSUM | 276 + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | 277 + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | 278 + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | 279 + NETIF_F_GSO_UDP_TUNNEL_CSUM; 280 + } 281 + } 282 + 283 + static void vmxnet3_disable_encap_offloads(struct net_device *netdev) 284 + { 285 + struct vmxnet3_adapter *adapter = netdev_priv(netdev); 286 + 287 + if (VMXNET3_VERSION_GE_4(adapter)) { 288 + netdev->hw_enc_features &= ~(NETIF_F_SG | NETIF_F_RXCSUM | 289 + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | 290 + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | 291 + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | 292 + NETIF_F_GSO_UDP_TUNNEL_CSUM); 293 + } 294 + } 295 + 270 296 int vmxnet3_set_features(struct net_device *netdev, netdev_features_t features) 271 297 { 272 298 struct vmxnet3_adapter *adapter = netdev_priv(netdev); 273 299 unsigned long flags; 274 300 netdev_features_t changed = features ^ netdev->features; 301 + netdev_features_t tun_offload_mask = NETIF_F_GSO_UDP_TUNNEL | 302 + NETIF_F_GSO_UDP_TUNNEL_CSUM; 303 + u8 udp_tun_enabled = (netdev->features & tun_offload_mask) != 0; 275 304 276 305 if (changed & (NETIF_F_RXCSUM | NETIF_F_LRO | 277 - NETIF_F_HW_VLAN_CTAG_RX)) { 306 + NETIF_F_HW_VLAN_CTAG_RX | tun_offload_mask)) { 278 307 if (features & NETIF_F_RXCSUM) 279 308 adapter->shared->devRead.misc.uptFeatures |= 280 309 UPT1_F_RXCSUM; ··· 325 296 else 326 297 adapter->shared->devRead.misc.uptFeatures &= 327 298 ~UPT1_F_RXVLAN; 299 + 300 + if ((features & tun_offload_mask) != 0 && !udp_tun_enabled) { 301 + vmxnet3_enable_encap_offloads(netdev); 302 + adapter->shared->devRead.misc.uptFeatures |= 303 + UPT1_F_RXINNEROFLD; 304 + } else if ((features 
& tun_offload_mask) == 0 && 305 + udp_tun_enabled) { 306 + vmxnet3_disable_encap_offloads(netdev); 307 + adapter->shared->devRead.misc.uptFeatures &= 308 + ~UPT1_F_RXINNEROFLD; 309 + } 328 310 329 311 spin_lock_irqsave(&adapter->cmd_lock, flags); 330 312 VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+9 -3
drivers/net/vmxnet3/vmxnet3_int.h
··· 219 219 bool ipv4; 220 220 bool ipv6; 221 221 u16 mss; 222 - u32 eth_ip_hdr_size; /* only valid for pkts requesting tso or csum 223 - * offloading 222 + u32 l4_offset; /* only valid for pkts requesting tso or csum 223 + * offloading. For encap offload, it refers to 224 + * inner L4 offset i.e. it includes outer header 225 + * encap header and inner eth and ip header size 224 226 */ 225 - u32 l4_hdr_size; /* only valid if mss != 0 */ 227 + 228 + u32 l4_hdr_size; /* only valid if mss != 0 229 + * Refers to inner L4 hdr size for encap 230 + * offload 231 + */ 226 232 u32 copy_size; /* # of bytes copied into the data ring */ 227 233 union Vmxnet3_GenericDesc *sop_txd; 228 234 union Vmxnet3_GenericDesc *eop_txd;