Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

udp: generate gso with UDP_SEGMENT

Support generic segmentation offload for udp datagrams. Callers can
concatenate the payloads of multiple datagrams with the same
destination and send them in a single call.

To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value plus the network and UDP
header length must be smaller than or equal to the relevant MTU.

A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.

Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.

The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.

Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.

tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles

tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles

tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles

udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles

udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles

[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")

Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:

perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4

Note the reduction in calls/s with GSO. Bytes per syscall increases
from 1470 to 61818.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Willem de Bruijn and committed by
David S. Miller
bec1f6f6 ee80d1eb

+67 -11
+3
include/linux/udp.h
··· 55 55 * when the socket is uncorked. 56 56 */ 57 57 __u16 len; /* total length of pending frames */ 58 + __u16 gso_size; 58 59 /* 59 60 * Fields specific to UDP-Lite. 60 61 */ ··· 87 86 /* This field is dirtied by udp_recvmsg() */ 88 87 int forward_deficit; 89 88 }; 89 + 90 + #define UDP_MAX_SEGMENTS (1 << 6UL) 90 91 91 92 static inline struct udp_sock *udp_sk(const struct sock *sk) 92 93 {
+1
include/net/inet_sock.h
··· 147 147 __u8 ttl; 148 148 __s16 tos; 149 149 char priority; 150 + __u16 gso_size; 150 151 }; 151 152 152 153 struct inet_cork_full {
+1
include/net/ip.h
··· 76 76 __u8 ttl; 77 77 __s16 tos; 78 78 char priority; 79 + __u16 gso_size; 79 80 }; 80 81 81 82 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
+1
include/net/ipv6.h
··· 298 298 __s16 tclass; 299 299 __s8 dontfrag; 300 300 struct ipv6_txoptions *opt; 301 + __u16 gso_size; 301 302 }; 302 303 303 304 static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
+1
include/uapi/linux/udp.h
··· 32 32 #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ 33 33 #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ 34 34 #define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ 35 + #define UDP_SEGMENT 103 /* Set GSO segmentation size */ 35 36 36 37 /* UDP encapsulation types */ 37 38 #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
+6 -3
net/ipv4/ip_output.c
··· 882 882 skb = skb_peek_tail(queue); 883 883 884 884 exthdrlen = !skb ? rt->dst.header_len : 0; 885 - mtu = cork->fragsize; 885 + mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; 886 + 886 887 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && 887 888 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) 888 889 tskey = sk->sk_tskey++; ··· 907 906 if (transhdrlen && 908 907 length + fragheaderlen <= mtu && 909 908 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && 910 - !(flags & MSG_MORE) && 909 + (!(flags & MSG_MORE) || cork->gso_size) && 911 910 !exthdrlen) 912 911 csummode = CHECKSUM_PARTIAL; 913 912 ··· 1136 1135 *rtp = NULL; 1137 1136 cork->fragsize = ip_sk_use_pmtu(sk) ? 1138 1137 dst_mtu(&rt->dst) : rt->dst.dev->mtu; 1138 + 1139 + cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0; 1139 1140 cork->dst = &rt->dst; 1140 1141 cork->length = 0; 1141 1142 cork->ttl = ipc->ttl; ··· 1217 1214 return -EOPNOTSUPP; 1218 1215 1219 1216 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1220 - mtu = cork->fragsize; 1217 + mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; 1221 1218 1222 1219 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1223 1220 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+30 -3
net/ipv4/udp.c
··· 757 757 } 758 758 EXPORT_SYMBOL(udp_set_csum); 759 759 760 - static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 760 + static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, 761 + struct inet_cork *cork) 761 762 { 762 763 struct sock *sk = skb->sk; 763 764 struct inet_sock *inet = inet_sk(sk); ··· 777 776 uh->dest = fl4->fl4_dport; 778 777 uh->len = htons(len); 779 778 uh->check = 0; 779 + 780 + if (cork->gso_size) { 781 + const int hlen = skb_network_header_len(skb) + 782 + sizeof(struct udphdr); 783 + 784 + if (hlen + cork->gso_size > cork->fragsize) 785 + return -EINVAL; 786 + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) 787 + return -EINVAL; 788 + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) 789 + return -EIO; 790 + 791 + skb_shinfo(skb)->gso_size = cork->gso_size; 792 + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; 793 + } 780 794 781 795 if (is_udplite) /* UDP-Lite */ 782 796 csum = udplite_csum(skb); ··· 844 828 if (!skb) 845 829 goto out; 846 830 847 - err = udp_send_skb(skb, fl4); 831 + err = udp_send_skb(skb, fl4, &inet->cork.base); 848 832 849 833 out: 850 834 up->len = 0; ··· 938 922 ipc.sockc.tsflags = sk->sk_tsflags; 939 923 ipc.addr = inet->inet_saddr; 940 924 ipc.oif = sk->sk_bound_dev_if; 925 + ipc.gso_size = up->gso_size; 941 926 942 927 if (msg->msg_controllen) { 943 928 err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); ··· 1054 1037 &cork, msg->msg_flags); 1055 1038 err = PTR_ERR(skb); 1056 1039 if (!IS_ERR_OR_NULL(skb)) 1057 - err = udp_send_skb(skb, fl4); 1040 + err = udp_send_skb(skb, fl4, &cork); 1058 1041 goto out; 1059 1042 } 1060 1043 ··· 2384 2367 up->no_check6_rx = valbool; 2385 2368 break; 2386 2369 2370 + case UDP_SEGMENT: 2371 + if (val < 0 || val > USHRT_MAX) 2372 + return -EINVAL; 2373 + up->gso_size = val; 2374 + break; 2375 + 2387 2376 /* 2388 2377 * UDP-Lite's partial checksum coverage (RFC 3828). 
2389 2378 */ ··· 2478 2455 2479 2456 case UDP_NO_CHECK6_RX: 2480 2457 val = up->no_check6_rx; 2458 + break; 2459 + 2460 + case UDP_SEGMENT: 2461 + val = up->gso_size; 2481 2462 break; 2482 2463 2483 2464 /* The following two cannot be changed on UDP sockets, the return is
+4 -2
net/ipv6/ip6_output.c
··· 1240 1240 if (mtu < IPV6_MIN_MTU) 1241 1241 return -EINVAL; 1242 1242 cork->base.fragsize = mtu; 1243 + cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0; 1244 + 1243 1245 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1244 1246 cork->base.flags |= IPCORK_ALLFRAG; 1245 1247 cork->base.length = 0; ··· 1283 1281 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1284 1282 } 1285 1283 1286 - mtu = cork->fragsize; 1284 + mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1287 1285 orig_mtu = mtu; 1288 1286 1289 1287 hh_len = LL_RESERVED_SPACE(rt->dst.dev); ··· 1331 1329 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1332 1330 headersize == sizeof(struct ipv6hdr) && 1333 1331 length <= mtu - headersize && 1334 - !(flags & MSG_MORE) && 1332 + (!(flags & MSG_MORE) || cork->gso_size) && 1335 1333 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1336 1334 csummode = CHECKSUM_PARTIAL; 1337 1335
+20 -3
net/ipv6/udp.c
··· 1023 1023 * Sending 1024 1024 */ 1025 1025 1026 - static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6) 1026 + static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, 1027 + struct inet_cork *cork) 1027 1028 { 1028 1029 struct sock *sk = skb->sk; 1029 1030 struct udphdr *uh; ··· 1042 1041 uh->dest = fl6->fl6_dport; 1043 1042 uh->len = htons(len); 1044 1043 uh->check = 0; 1044 + 1045 + if (cork->gso_size) { 1046 + const int hlen = skb_network_header_len(skb) + 1047 + sizeof(struct udphdr); 1048 + 1049 + if (hlen + cork->gso_size > cork->fragsize) 1050 + return -EINVAL; 1051 + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) 1052 + return -EINVAL; 1053 + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) 1054 + return -EIO; 1055 + 1056 + skb_shinfo(skb)->gso_size = cork->gso_size; 1057 + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; 1058 + } 1045 1059 1046 1060 if (is_udplite) 1047 1061 csum = udplite_csum(skb); ··· 1109 1093 if (!skb) 1110 1094 goto out; 1111 1095 1112 - err = udp_v6_send_skb(skb, &fl6); 1096 + err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base); 1113 1097 1114 1098 out: 1115 1099 up->len = 0; ··· 1143 1127 ipc6.hlimit = -1; 1144 1128 ipc6.tclass = -1; 1145 1129 ipc6.dontfrag = -1; 1130 + ipc6.gso_size = up->gso_size; 1146 1131 sockc.tsflags = sk->sk_tsflags; 1147 1132 1148 1133 /* destination address check */ ··· 1350 1333 msg->msg_flags, &cork, &sockc); 1351 1334 err = PTR_ERR(skb); 1352 1335 if (!IS_ERR_OR_NULL(skb)) 1353 - err = udp_v6_send_skb(skb, &fl6); 1336 + err = udp_v6_send_skb(skb, &fl6, &cork.base); 1354 1337 goto out; 1355 1338 } 1356 1339