Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

hv_netvsc: Add support for LRO/RSC in the vSwitch

LRO/RSC in the vSwitch is a feature available in Windows Server 2019
hosts and later. It reduces the per packet processing overhead by
coalescing multiple TCP segments when possible. This patch adds netvsc
driver support for this feature.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Haiyang Zhang; committed by David S. Miller.
c8e4eff4 bd4d08da

+145 -38
+40 -7
drivers/net/hyperv/hyperv_net.h
··· 186 186 /* Interface */ 187 187 struct rndis_message; 188 188 struct netvsc_device; 189 + struct netvsc_channel; 189 190 struct net_device_context; 190 191 191 192 extern u32 netvsc_ring_bytes; ··· 204 203 struct rndis_message *resp); 205 204 int netvsc_recv_callback(struct net_device *net, 206 205 struct netvsc_device *nvdev, 207 - struct vmbus_channel *channel, 208 - void *data, u32 len, 209 - const struct ndis_tcp_ip_checksum_info *csum_info, 210 - const struct ndis_pkt_8021q_info *vlan); 206 + struct netvsc_channel *nvchan); 211 207 void netvsc_channel_cb(void *context); 212 208 int netvsc_poll(struct napi_struct *napi, int budget); 213 209 ··· 220 222 const u8 *key); 221 223 int rndis_filter_receive(struct net_device *ndev, 222 224 struct netvsc_device *net_dev, 223 - struct vmbus_channel *channel, 225 + struct netvsc_channel *nvchan, 224 226 void *data, u32 buflen); 225 227 226 228 int rndis_filter_set_device_mac(struct netvsc_device *ndev, ··· 522 524 u64 ieee8021q:1; 523 525 u64 correlation_id:1; 524 526 u64 teaming:1; 527 + u64 vsubnetid:1; 528 + u64 rsc:1; 525 529 }; 526 530 }; 527 531 } __packed; ··· 826 826 827 827 #define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \ 828 828 NETIF_F_TSO | NETIF_F_IPV6_CSUM | \ 829 - NETIF_F_TSO6) 829 + NETIF_F_TSO6 | NETIF_F_LRO) 830 830 831 831 #define VRSS_SEND_TAB_SIZE 16 /* must be power of 2 */ 832 832 #define VRSS_CHANNEL_MAX 64 ··· 850 850 struct recv_comp_data *slots; 851 851 u32 first; /* first data entry */ 852 852 u32 next; /* next entry for writing */ 853 + }; 854 + 855 + #define NVSP_RSC_MAX 562 /* Max #RSC frags in a vmbus xfer page pkt */ 856 + 857 + struct nvsc_rsc { 858 + const struct ndis_pkt_8021q_info *vlan; 859 + const struct ndis_tcp_ip_checksum_info *csum_info; 860 + u8 is_last; /* last RNDIS msg in a vmtransfer_page */ 861 + u32 cnt; /* #fragments in an RSC packet */ 862 + u32 pktlen; /* Full packet length */ 863 + void *data[NVSP_RSC_MAX]; 864 + u32 len[NVSP_RSC_MAX]; 
853 865 }; 854 866 855 867 struct netvsc_stats { ··· 967 955 struct multi_send_data msd; 968 956 struct multi_recv_comp mrc; 969 957 atomic_t queue_sends; 958 + struct nvsc_rsc rsc; 970 959 971 960 struct netvsc_stats tx_stats; 972 961 struct netvsc_stats rx_stats; ··· 1149 1136 /* Packet extension field contents associated with a Data message. */ 1150 1137 struct rndis_per_packet_info { 1151 1138 u32 size; 1152 - u32 type; 1139 + u32 type:31; 1140 + u32 internal:1; 1153 1141 u32 ppi_offset; 1154 1142 }; 1155 1143 ··· 1169 1155 CACHED_NET_BUFLIST, 1170 1156 SHORT_PKT_PADINFO, 1171 1157 MAX_PER_PKT_INFO 1158 + }; 1159 + 1160 + enum rndis_per_pkt_info_interal_type { 1161 + RNDIS_PKTINFO_ID = 1, 1162 + /* Add more memebers here */ 1163 + 1164 + RNDIS_PKTINFO_MAX 1165 + }; 1166 + 1167 + #define RNDIS_PKTINFO_SUBALLOC BIT(0) 1168 + #define RNDIS_PKTINFO_1ST_FRAG BIT(1) 1169 + #define RNDIS_PKTINFO_LAST_FRAG BIT(2) 1170 + 1171 + #define RNDIS_PKTINFO_ID_V1 1 1172 + 1173 + struct rndis_pktinfo_id { 1174 + u8 ver; 1175 + u8 flag; 1176 + u16 pkt_id; 1172 1177 }; 1173 1178 1174 1179 struct ndis_pkt_8021q_info {
+13 -5
drivers/net/hyperv/netvsc.c
··· 542 542 init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1; 543 543 } 544 544 545 + if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61) 546 + init_packet->msg.v2_msg.send_ndis_config.capability.rsc = 1; 547 + 545 548 trace_nvsp_send(ndev, init_packet); 546 549 547 550 ret = vmbus_sendpacket(device->channel, init_packet, ··· 1114 1111 1115 1112 static int netvsc_receive(struct net_device *ndev, 1116 1113 struct netvsc_device *net_device, 1117 - struct vmbus_channel *channel, 1114 + struct netvsc_channel *nvchan, 1118 1115 const struct vmpacket_descriptor *desc, 1119 1116 const struct nvsp_message *nvsp) 1120 1117 { 1121 1118 struct net_device_context *net_device_ctx = netdev_priv(ndev); 1119 + struct vmbus_channel *channel = nvchan->channel; 1122 1120 const struct vmtransfer_page_packet_header *vmxferpage_packet 1123 1121 = container_of(desc, const struct vmtransfer_page_packet_header, d); 1124 1122 u16 q_idx = channel->offermsg.offer.sub_channel_index; ··· 1154 1150 int ret; 1155 1151 1156 1152 if (unlikely(offset + buflen > net_device->recv_buf_size)) { 1153 + nvchan->rsc.cnt = 0; 1157 1154 status = NVSP_STAT_FAIL; 1158 1155 netif_err(net_device_ctx, rx_err, ndev, 1159 1156 "Packet offset:%u + len:%u too big\n", ··· 1165 1160 1166 1161 data = recv_buf + offset; 1167 1162 1163 + nvchan->rsc.is_last = (i == count - 1); 1164 + 1168 1165 trace_rndis_recv(ndev, q_idx, data); 1169 1166 1170 1167 /* Pass it to the upper layer */ 1171 1168 ret = rndis_filter_receive(ndev, net_device, 1172 - channel, data, buflen); 1169 + nvchan, data, buflen); 1173 1170 1174 1171 if (unlikely(ret != NVSP_STAT_SUCCESS)) 1175 1172 status = NVSP_STAT_FAIL; ··· 1230 1223 } 1231 1224 1232 1225 static int netvsc_process_raw_pkt(struct hv_device *device, 1233 - struct vmbus_channel *channel, 1226 + struct netvsc_channel *nvchan, 1234 1227 struct netvsc_device *net_device, 1235 1228 struct net_device *ndev, 1236 1229 const struct vmpacket_descriptor *desc, 1237 1230 int budget) 1238 1231 { 
1232 + struct vmbus_channel *channel = nvchan->channel; 1239 1233 const struct nvsp_message *nvmsg = hv_pkt_data(desc); 1240 1234 1241 1235 trace_nvsp_recv(ndev, channel, nvmsg); ··· 1248 1240 break; 1249 1241 1250 1242 case VM_PKT_DATA_USING_XFER_PAGES: 1251 - return netvsc_receive(ndev, net_device, channel, 1243 + return netvsc_receive(ndev, net_device, nvchan, 1252 1244 desc, nvmsg); 1253 1245 break; 1254 1246 ··· 1292 1284 nvchan->desc = hv_pkt_iter_first(channel); 1293 1285 1294 1286 while (nvchan->desc && work_done < budget) { 1295 - work_done += netvsc_process_raw_pkt(device, channel, net_device, 1287 + work_done += netvsc_process_raw_pkt(device, nvchan, net_device, 1296 1288 ndev, nvchan->desc, budget); 1297 1289 nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); 1298 1290 }
+14 -14
drivers/net/hyperv/netvsc_drv.c
··· 744 744 } 745 745 746 746 static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, 747 - struct napi_struct *napi, 748 - const struct ndis_tcp_ip_checksum_info *csum_info, 749 - const struct ndis_pkt_8021q_info *vlan, 750 - void *data, u32 buflen) 747 + struct netvsc_channel *nvchan) 751 748 { 749 + struct napi_struct *napi = &nvchan->napi; 750 + const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan; 751 + const struct ndis_tcp_ip_checksum_info *csum_info = 752 + nvchan->rsc.csum_info; 752 753 struct sk_buff *skb; 754 + int i; 753 755 754 - skb = napi_alloc_skb(napi, buflen); 756 + skb = napi_alloc_skb(napi, nvchan->rsc.pktlen); 755 757 if (!skb) 756 758 return skb; 757 759 ··· 761 759 * Copy to skb. This copy is needed here since the memory pointed by 762 760 * hv_netvsc_packet cannot be deallocated 763 761 */ 764 - skb_put_data(skb, data, buflen); 762 + for (i = 0; i < nvchan->rsc.cnt; i++) 763 + skb_put_data(skb, nvchan->rsc.data[i], nvchan->rsc.len[i]); 765 764 766 765 skb->protocol = eth_type_trans(skb, net); 767 766 ··· 795 792 */ 796 793 int netvsc_recv_callback(struct net_device *net, 797 794 struct netvsc_device *net_device, 798 - struct vmbus_channel *channel, 799 - void *data, u32 len, 800 - const struct ndis_tcp_ip_checksum_info *csum_info, 801 - const struct ndis_pkt_8021q_info *vlan) 795 + struct netvsc_channel *nvchan) 802 796 { 803 797 struct net_device_context *net_device_ctx = netdev_priv(net); 798 + struct vmbus_channel *channel = nvchan->channel; 804 799 u16 q_idx = channel->offermsg.offer.sub_channel_index; 805 - struct netvsc_channel *nvchan = &net_device->chan_table[q_idx]; 806 800 struct sk_buff *skb; 807 801 struct netvsc_stats *rx_stats; 808 802 ··· 807 807 return NVSP_STAT_FAIL; 808 808 809 809 /* Allocate a skb - TODO direct I/O to pages? 
*/ 810 - skb = netvsc_alloc_recv_skb(net, &nvchan->napi, 811 - csum_info, vlan, data, len); 810 + skb = netvsc_alloc_recv_skb(net, nvchan); 811 + 812 812 if (unlikely(!skb)) { 813 813 ++net_device_ctx->eth_stats.rx_no_memory; 814 814 rcu_read_unlock(); ··· 825 825 rx_stats = &nvchan->rx_stats; 826 826 u64_stats_update_begin(&rx_stats->syncp); 827 827 rx_stats->packets++; 828 - rx_stats->bytes += len; 828 + rx_stats->bytes += nvchan->rsc.pktlen; 829 829 830 830 if (skb->pkt_type == PACKET_BROADCAST) 831 831 ++rx_stats->broadcast;
+78 -12
drivers/net/hyperv/rndis_filter.c
··· 342 342 * Get the Per-Packet-Info with the specified type 343 343 * return NULL if not found. 344 344 */ 345 - static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type) 345 + static inline void *rndis_get_ppi(struct rndis_packet *rpkt, 346 + u32 type, u8 internal) 346 347 { 347 348 struct rndis_per_packet_info *ppi; 348 349 int len; ··· 356 355 len = rpkt->per_pkt_info_len; 357 356 358 357 while (len > 0) { 359 - if (ppi->type == type) 358 + if (ppi->type == type && ppi->internal == internal) 360 359 return (void *)((ulong)ppi + ppi->ppi_offset); 361 360 len -= ppi->size; 362 361 ppi = (struct rndis_per_packet_info *)((ulong)ppi + ppi->size); ··· 365 364 return NULL; 366 365 } 367 366 367 + static inline 368 + void rsc_add_data(struct netvsc_channel *nvchan, 369 + const struct ndis_pkt_8021q_info *vlan, 370 + const struct ndis_tcp_ip_checksum_info *csum_info, 371 + void *data, u32 len) 372 + { 373 + u32 cnt = nvchan->rsc.cnt; 374 + 375 + if (cnt) { 376 + nvchan->rsc.pktlen += len; 377 + } else { 378 + nvchan->rsc.vlan = vlan; 379 + nvchan->rsc.csum_info = csum_info; 380 + nvchan->rsc.pktlen = len; 381 + } 382 + 383 + nvchan->rsc.data[cnt] = data; 384 + nvchan->rsc.len[cnt] = len; 385 + nvchan->rsc.cnt++; 386 + } 387 + 368 388 static int rndis_filter_receive_data(struct net_device *ndev, 369 389 struct netvsc_device *nvdev, 370 - struct vmbus_channel *channel, 390 + struct netvsc_channel *nvchan, 371 391 struct rndis_message *msg, 372 392 u32 data_buflen) 373 393 { 374 394 struct rndis_packet *rndis_pkt = &msg->msg.pkt; 375 395 const struct ndis_tcp_ip_checksum_info *csum_info; 376 396 const struct ndis_pkt_8021q_info *vlan; 397 + const struct rndis_pktinfo_id *pktinfo_id; 377 398 u32 data_offset; 378 399 void *data; 400 + bool rsc_more = false; 401 + int ret; 379 402 380 403 /* Remove the rndis header and pass it back up the stack */ 381 404 data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; ··· 418 393 return NVSP_STAT_FAIL; 419 394 } 420 
395 421 - vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO); 396 + vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO, 0); 422 397 423 - csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO); 398 + csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO, 0); 399 + 400 + pktinfo_id = rndis_get_ppi(rndis_pkt, RNDIS_PKTINFO_ID, 1); 424 401 425 402 data = (void *)msg + data_offset; 426 403 427 - /* 428 - * Remove the rndis trailer padding from rndis packet message 404 + /* Identify RSC frags, drop erroneous packets */ 405 + if (pktinfo_id && (pktinfo_id->flag & RNDIS_PKTINFO_SUBALLOC)) { 406 + if (pktinfo_id->flag & RNDIS_PKTINFO_1ST_FRAG) 407 + nvchan->rsc.cnt = 0; 408 + else if (nvchan->rsc.cnt == 0) 409 + goto drop; 410 + 411 + rsc_more = true; 412 + 413 + if (pktinfo_id->flag & RNDIS_PKTINFO_LAST_FRAG) 414 + rsc_more = false; 415 + 416 + if (rsc_more && nvchan->rsc.is_last) 417 + goto drop; 418 + } else { 419 + nvchan->rsc.cnt = 0; 420 + } 421 + 422 + if (unlikely(nvchan->rsc.cnt >= NVSP_RSC_MAX)) 423 + goto drop; 424 + 425 + /* Put data into per channel structure. 
426 + * Also, remove the rndis trailer padding from rndis packet message 429 427 * rndis_pkt->data_len tell us the real data length, we only copy 430 428 * the data packet to the stack, without the rndis trailer padding 431 429 */ 432 - return netvsc_recv_callback(ndev, nvdev, channel, 433 - data, rndis_pkt->data_len, 434 - csum_info, vlan); 430 + rsc_add_data(nvchan, vlan, csum_info, data, rndis_pkt->data_len); 431 + 432 + if (rsc_more) 433 + return NVSP_STAT_SUCCESS; 434 + 435 + ret = netvsc_recv_callback(ndev, nvdev, nvchan); 436 + nvchan->rsc.cnt = 0; 437 + 438 + return ret; 439 + 440 + drop: 441 + /* Drop incomplete packet */ 442 + nvchan->rsc.cnt = 0; 443 + return NVSP_STAT_FAIL; 435 444 } 436 445 437 446 int rndis_filter_receive(struct net_device *ndev, 438 447 struct netvsc_device *net_dev, 439 - struct vmbus_channel *channel, 448 + struct netvsc_channel *nvchan, 440 449 void *data, u32 buflen) 441 450 { 442 451 struct net_device_context *net_device_ctx = netdev_priv(ndev); ··· 481 422 482 423 switch (rndis_msg->ndis_msg_type) { 483 424 case RNDIS_MSG_PACKET: 484 - return rndis_filter_receive_data(ndev, net_dev, channel, 425 + return rndis_filter_receive_data(ndev, net_dev, nvchan, 485 426 rndis_msg, buflen); 486 427 case RNDIS_MSG_INIT_C: 487 428 case RNDIS_MSG_QUERY_C: ··· 1241 1182 offloads.udp_ip_v6_csum = NDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; 1242 1183 net_device_ctx->tx_checksum_mask |= TRANSPORT_INFO_IPV6_UDP; 1243 1184 } 1185 + } 1186 + 1187 + if (hwcaps.rsc.ip4 && hwcaps.rsc.ip6) { 1188 + net->hw_features |= NETIF_F_LRO; 1189 + 1190 + offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; 1191 + offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; 1244 1192 } 1245 1193 1246 1194 /* In case some hw_features disappeared we need to remove them from