Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

inet: Remove explicit write references to sk/inet in ip_append_data

In order to allow simultaneous calls to ip_append_data on the same
socket, it must not modify any shared state in sk or inet (other
than state that is designed to allow this, such as atomic counters).

This patch abstracts out write references to sk and inet_sk in
ip_append_data and its friends so that we may use the underlying
code in parallel.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Herbert Xu and committed by
David S. Miller
1470ddf7 5a2ef920

+154 -107
+14 -9
include/net/inet_sock.h
··· 86 86 return (struct inet_request_sock *)sk; 87 87 } 88 88 89 + struct inet_cork { 90 + unsigned int flags; 91 + unsigned int fragsize; 92 + struct ip_options *opt; 93 + struct dst_entry *dst; 94 + int length; /* Total length of all frames */ 95 + __be32 addr; 96 + struct flowi fl; 97 + struct page *page; 98 + u32 off; 99 + u8 tx_flags; 100 + }; 101 + 89 102 struct ip_mc_socklist; 90 103 struct ipv6_pinfo; 91 104 struct rtable; ··· 156 143 int mc_index; 157 144 __be32 mc_addr; 158 145 struct ip_mc_socklist __rcu *mc_list; 159 - struct { 160 - unsigned int flags; 161 - unsigned int fragsize; 162 - struct ip_options *opt; 163 - struct dst_entry *dst; 164 - int length; /* Total length of all frames */ 165 - __be32 addr; 166 - struct flowi fl; 167 - } cork; 146 + struct inet_cork cork; 168 147 }; 169 148 170 149 #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
+140 -98
net/ipv4/ip_output.c
··· 733 733 } 734 734 735 735 static inline int ip_ufo_append_data(struct sock *sk, 736 + struct sk_buff_head *queue, 736 737 int getfrag(void *from, char *to, int offset, int len, 737 738 int odd, struct sk_buff *skb), 738 739 void *from, int length, int hh_len, int fragheaderlen, ··· 746 745 * device, so create one single skb packet containing complete 747 746 * udp datagram 748 747 */ 749 - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 748 + if ((skb = skb_peek_tail(queue)) == NULL) { 750 749 skb = sock_alloc_send_skb(sk, 751 750 hh_len + fragheaderlen + transhdrlen + 20, 752 751 (flags & MSG_DONTWAIT), &err); ··· 772 771 /* specify the length of each IP datagram fragment */ 773 772 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 774 773 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 775 - __skb_queue_tail(&sk->sk_write_queue, skb); 774 + __skb_queue_tail(queue, skb); 776 775 } 777 776 778 777 return skb_append_datato_frags(sk, skb, getfrag, from, 779 778 (length - transhdrlen)); 780 779 } 781 780 782 - /* 783 - * ip_append_data() and ip_append_page() can make one large IP datagram 784 - * from many pieces of data. Each pieces will be holded on the socket 785 - * until ip_push_pending_frames() is called. Each piece can be a page 786 - * or non-page data. 787 - * 788 - * Not only UDP, other transport protocols - e.g. raw sockets - can use 789 - * this interface potentially. 790 - * 791 - * LATER: length must be adjusted by pad at tail, when it is required. 
792 - */ 793 - int ip_append_data(struct sock *sk, 794 - int getfrag(void *from, char *to, int offset, int len, 795 - int odd, struct sk_buff *skb), 796 - void *from, int length, int transhdrlen, 797 - struct ipcm_cookie *ipc, struct rtable **rtp, 798 - unsigned int flags) 781 + static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, 782 + struct inet_cork *cork, 783 + int getfrag(void *from, char *to, int offset, 784 + int len, int odd, struct sk_buff *skb), 785 + void *from, int length, int transhdrlen, 786 + unsigned int flags) 799 787 { 800 788 struct inet_sock *inet = inet_sk(sk); 801 789 struct sk_buff *skb; 802 790 803 - struct ip_options *opt = NULL; 791 + struct ip_options *opt = inet->cork.opt; 804 792 int hh_len; 805 793 int exthdrlen; 806 794 int mtu; ··· 798 808 int offset = 0; 799 809 unsigned int maxfraglen, fragheaderlen; 800 810 int csummode = CHECKSUM_NONE; 801 - struct rtable *rt; 811 + struct rtable *rt = (struct rtable *)cork->dst; 802 812 803 - if (flags&MSG_PROBE) 804 - return 0; 813 + exthdrlen = transhdrlen ? rt->dst.header_len : 0; 814 + length += exthdrlen; 815 + transhdrlen += exthdrlen; 816 + mtu = inet->cork.fragsize; 805 817 806 - if (skb_queue_empty(&sk->sk_write_queue)) { 807 - /* 808 - * setup for corking. 809 - */ 810 - opt = ipc->opt; 811 - if (opt) { 812 - if (inet->cork.opt == NULL) { 813 - inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); 814 - if (unlikely(inet->cork.opt == NULL)) 815 - return -ENOBUFS; 816 - } 817 - memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); 818 - inet->cork.flags |= IPCORK_OPT; 819 - inet->cork.addr = ipc->addr; 820 - } 821 - rt = *rtp; 822 - if (unlikely(!rt)) 823 - return -EFAULT; 824 - /* 825 - * We steal reference to this route, caller should not release it 826 - */ 827 - *rtp = NULL; 828 - inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 
829 - rt->dst.dev->mtu : 830 - dst_mtu(rt->dst.path); 831 - inet->cork.dst = &rt->dst; 832 - inet->cork.length = 0; 833 - sk->sk_sndmsg_page = NULL; 834 - sk->sk_sndmsg_off = 0; 835 - exthdrlen = rt->dst.header_len; 836 - length += exthdrlen; 837 - transhdrlen += exthdrlen; 838 - } else { 839 - rt = (struct rtable *)inet->cork.dst; 840 - if (inet->cork.flags & IPCORK_OPT) 841 - opt = inet->cork.opt; 842 - 843 - transhdrlen = 0; 844 - exthdrlen = 0; 845 - mtu = inet->cork.fragsize; 846 - } 847 818 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 848 819 849 820 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 850 821 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 851 822 852 - if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 823 + if (cork->length + length > 0xFFFF - fragheaderlen) { 853 824 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 854 825 mtu-exthdrlen); 855 826 return -EMSGSIZE; ··· 826 875 !exthdrlen) 827 876 csummode = CHECKSUM_PARTIAL; 828 877 829 - skb = skb_peek_tail(&sk->sk_write_queue); 878 + skb = skb_peek_tail(queue); 830 879 831 - inet->cork.length += length; 880 + cork->length += length; 832 881 if (((length > mtu) || (skb && skb_is_gso(skb))) && 833 882 (sk->sk_protocol == IPPROTO_UDP) && 834 883 (rt->dst.dev->features & NETIF_F_UFO)) { 835 - err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 836 - fragheaderlen, transhdrlen, mtu, 837 - flags); 884 + err = ip_ufo_append_data(sk, queue, getfrag, from, length, 885 + hh_len, fragheaderlen, transhdrlen, 886 + mtu, flags); 838 887 if (err) 839 888 goto error; 840 889 return 0; ··· 911 960 else 912 961 /* only the initial fragment is 913 962 time stamped */ 914 - ipc->tx_flags = 0; 963 + cork->tx_flags = 0; 915 964 } 916 965 if (skb == NULL) 917 966 goto error; ··· 922 971 skb->ip_summed = csummode; 923 972 skb->csum = 0; 924 973 skb_reserve(skb, hh_len); 925 - skb_shinfo(skb)->tx_flags = ipc->tx_flags; 974 + skb_shinfo(skb)->tx_flags = cork->tx_flags; 
926 975 927 976 /* 928 977 * Find where to start putting bytes. ··· 959 1008 /* 960 1009 * Put the packet on the pending queue. 961 1010 */ 962 - __skb_queue_tail(&sk->sk_write_queue, skb); 1011 + __skb_queue_tail(queue, skb); 963 1012 continue; 964 1013 } 965 1014 ··· 979 1028 } else { 980 1029 int i = skb_shinfo(skb)->nr_frags; 981 1030 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 982 - struct page *page = sk->sk_sndmsg_page; 983 - int off = sk->sk_sndmsg_off; 1031 + struct page *page = cork->page; 1032 + int off = cork->off; 984 1033 unsigned int left; 985 1034 986 1035 if (page && (left = PAGE_SIZE - off) > 0) { ··· 992 1041 goto error; 993 1042 } 994 1043 get_page(page); 995 - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 1044 + skb_fill_page_desc(skb, i, page, off, 0); 996 1045 frag = &skb_shinfo(skb)->frags[i]; 997 1046 } 998 1047 } else if (i < MAX_SKB_FRAGS) { ··· 1003 1052 err = -ENOMEM; 1004 1053 goto error; 1005 1054 } 1006 - sk->sk_sndmsg_page = page; 1007 - sk->sk_sndmsg_off = 0; 1055 + cork->page = page; 1056 + cork->off = 0; 1008 1057 1009 1058 skb_fill_page_desc(skb, i, page, 0, 0); 1010 1059 frag = &skb_shinfo(skb)->frags[i]; ··· 1016 1065 err = -EFAULT; 1017 1066 goto error; 1018 1067 } 1019 - sk->sk_sndmsg_off += copy; 1068 + cork->off += copy; 1020 1069 frag->size += copy; 1021 1070 skb->len += copy; 1022 1071 skb->data_len += copy; ··· 1030 1079 return 0; 1031 1080 1032 1081 error: 1033 - inet->cork.length -= length; 1082 + cork->length -= length; 1034 1083 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1035 1084 return err; 1085 + } 1086 + 1087 + static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, 1088 + struct ipcm_cookie *ipc, struct rtable **rtp) 1089 + { 1090 + struct inet_sock *inet = inet_sk(sk); 1091 + struct ip_options *opt; 1092 + struct rtable *rt; 1093 + 1094 + /* 1095 + * setup for corking. 
1096 + */ 1097 + opt = ipc->opt; 1098 + if (opt) { 1099 + if (cork->opt == NULL) { 1100 + cork->opt = kmalloc(sizeof(struct ip_options) + 40, 1101 + sk->sk_allocation); 1102 + if (unlikely(cork->opt == NULL)) 1103 + return -ENOBUFS; 1104 + } 1105 + memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen); 1106 + cork->flags |= IPCORK_OPT; 1107 + cork->addr = ipc->addr; 1108 + } 1109 + rt = *rtp; 1110 + if (unlikely(!rt)) 1111 + return -EFAULT; 1112 + /* 1113 + * We steal reference to this route, caller should not release it 1114 + */ 1115 + *rtp = NULL; 1116 + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? 1117 + rt->dst.dev->mtu : dst_mtu(rt->dst.path); 1118 + cork->dst = &rt->dst; 1119 + cork->length = 0; 1120 + cork->tx_flags = ipc->tx_flags; 1121 + cork->page = NULL; 1122 + cork->off = 0; 1123 + 1124 + return 0; 1125 + } 1126 + 1127 + /* 1128 + * ip_append_data() and ip_append_page() can make one large IP datagram 1129 + * from many pieces of data. Each pieces will be holded on the socket 1130 + * until ip_push_pending_frames() is called. Each piece can be a page 1131 + * or non-page data. 1132 + * 1133 + * Not only UDP, other transport protocols - e.g. raw sockets - can use 1134 + * this interface potentially. 1135 + * 1136 + * LATER: length must be adjusted by pad at tail, when it is required. 
1137 + */ 1138 + int ip_append_data(struct sock *sk, 1139 + int getfrag(void *from, char *to, int offset, int len, 1140 + int odd, struct sk_buff *skb), 1141 + void *from, int length, int transhdrlen, 1142 + struct ipcm_cookie *ipc, struct rtable **rtp, 1143 + unsigned int flags) 1144 + { 1145 + struct inet_sock *inet = inet_sk(sk); 1146 + int err; 1147 + 1148 + if (flags&MSG_PROBE) 1149 + return 0; 1150 + 1151 + if (skb_queue_empty(&sk->sk_write_queue)) { 1152 + err = ip_setup_cork(sk, &inet->cork, ipc, rtp); 1153 + if (err) 1154 + return err; 1155 + } else { 1156 + transhdrlen = 0; 1157 + } 1158 + 1159 + return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag, 1160 + from, length, transhdrlen, flags); 1036 1161 } 1037 1162 1038 1163 ssize_t ip_append_page(struct sock *sk, struct page *page, ··· 1254 1227 return err; 1255 1228 } 1256 1229 1257 - static void ip_cork_release(struct inet_sock *inet) 1230 + static void ip_cork_release(struct inet_cork *cork) 1258 1231 { 1259 - inet->cork.flags &= ~IPCORK_OPT; 1260 - kfree(inet->cork.opt); 1261 - inet->cork.opt = NULL; 1262 - dst_release(inet->cork.dst); 1263 - inet->cork.dst = NULL; 1232 + cork->flags &= ~IPCORK_OPT; 1233 + kfree(cork->opt); 1234 + cork->opt = NULL; 1235 + dst_release(cork->dst); 1236 + cork->dst = NULL; 1264 1237 } 1265 1238 1266 1239 /* 1267 1240 * Combined all pending IP fragments on the socket as one IP datagram 1268 1241 * and push them out. 
1269 1242 */ 1270 - int ip_push_pending_frames(struct sock *sk) 1243 + static int __ip_push_pending_frames(struct sock *sk, 1244 + struct sk_buff_head *queue, 1245 + struct inet_cork *cork) 1271 1246 { 1272 1247 struct sk_buff *skb, *tmp_skb; 1273 1248 struct sk_buff **tail_skb; 1274 1249 struct inet_sock *inet = inet_sk(sk); 1275 1250 struct net *net = sock_net(sk); 1276 1251 struct ip_options *opt = NULL; 1277 - struct rtable *rt = (struct rtable *)inet->cork.dst; 1252 + struct rtable *rt = (struct rtable *)cork->dst; 1278 1253 struct iphdr *iph; 1279 1254 __be16 df = 0; 1280 1255 __u8 ttl; 1281 1256 int err = 0; 1282 1257 1283 - if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1258 + if ((skb = __skb_dequeue(queue)) == NULL) 1284 1259 goto out; 1285 1260 tail_skb = &(skb_shinfo(skb)->frag_list); 1286 1261 1287 1262 /* move skb->data to ip header from ext header */ 1288 1263 if (skb->data < skb_network_header(skb)) 1289 1264 __skb_pull(skb, skb_network_offset(skb)); 1290 - while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1265 + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1291 1266 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1292 1267 *tail_skb = tmp_skb; 1293 1268 tail_skb = &(tmp_skb->next); ··· 1315 1286 ip_dont_fragment(sk, &rt->dst))) 1316 1287 df = htons(IP_DF); 1317 1288 1318 - if (inet->cork.flags & IPCORK_OPT) 1319 - opt = inet->cork.opt; 1289 + if (cork->flags & IPCORK_OPT) 1290 + opt = cork->opt; 1320 1291 1321 1292 if (rt->rt_type == RTN_MULTICAST) 1322 1293 ttl = inet->mc_ttl; ··· 1328 1299 iph->ihl = 5; 1329 1300 if (opt) { 1330 1301 iph->ihl += opt->optlen>>2; 1331 - ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1302 + ip_options_build(skb, opt, cork->addr, rt, 0); 1332 1303 } 1333 1304 iph->tos = inet->tos; 1334 1305 iph->frag_off = df; ··· 1344 1315 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1345 1316 * on dst refcount 1346 1317 */ 1347 - inet->cork.dst = NULL; 1318 + cork->dst = 
NULL; 1348 1319 skb_dst_set(skb, &rt->dst); 1349 1320 1350 1321 if (iph->protocol == IPPROTO_ICMP) ··· 1361 1332 } 1362 1333 1363 1334 out: 1364 - ip_cork_release(inet); 1335 + ip_cork_release(cork); 1365 1336 return err; 1366 1337 1367 1338 error: ··· 1369 1340 goto out; 1370 1341 } 1371 1342 1343 + int ip_push_pending_frames(struct sock *sk) 1344 + { 1345 + return __ip_push_pending_frames(sk, &sk->sk_write_queue, 1346 + &inet_sk(sk)->cork); 1347 + } 1348 + 1372 1349 /* 1373 1350 * Throw away all pending data on the socket. 1374 1351 */ 1375 - void ip_flush_pending_frames(struct sock *sk) 1352 + static void __ip_flush_pending_frames(struct sock *sk, 1353 + struct sk_buff_head *queue, 1354 + struct inet_cork *cork) 1376 1355 { 1377 1356 struct sk_buff *skb; 1378 1357 1379 - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1358 + while ((skb = __skb_dequeue_tail(queue)) != NULL) 1380 1359 kfree_skb(skb); 1381 1360 1382 - ip_cork_release(inet_sk(sk)); 1361 + ip_cork_release(cork); 1362 + } 1363 + 1364 + void ip_flush_pending_frames(struct sock *sk) 1365 + { 1366 + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); 1383 1367 } 1384 1368 1385 1369