Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: schedule EPOLLOUT after a partial sendmsg

For EPOLLET, applications must call sendmsg until they get EAGAIN.
Otherwise, there is no guarantee that EPOLLOUT is sent if there was
a failure upon memory allocation.

As a result on high-speed NICs, userspace observes multiple small
sendmsgs after a partial sendmsg until EAGAIN, since TCP can send
1-2 TSOs in between two sendmsg syscalls:

// One large partial send due to memory allocation failure.
sendmsg(20MB) = 2MB
// Many small sends until EAGAIN.
sendmsg(18MB) = 64KB
sendmsg(17.9MB) = 128KB
sendmsg(17.8MB) = 64KB
...
sendmsg(...) = EAGAIN
// At this point, userspace can assume an EPOLLOUT.

To fix this, set the SOCK_NOSPACE flag in all partial sendmsg scenarios
to guarantee that EPOLLOUT is sent after a partial sendmsg.

After this commit, userspace can assume that it will receive an EPOLLOUT
after the first partial sendmsg. This EPOLLOUT will benefit from the
sk_stream_write_space() logic delaying the EPOLLOUT until significant
space is available in the write queue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Soheil Hassas Yeganeh and committed by
David S. Miller
afb83012 8ba3c9d1

+9 -11
+9 -11
net/ipv4/tcp.c
@@ -1004,12 +1004,12 @@ (do_tcp_sendpages)
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;

 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;

 #ifdef CONFIG_TLS_DEVICE
 			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
@@ -1028,7 +1028,7 @@ (do_tcp_sendpages)
 			goto new_segment;
 		}
 		if (!sk_wmem_schedule(sk, copy))
-			goto wait_for_memory;
+			goto wait_for_space;

 		if (can_coalesce) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1069,9 +1069,8 @@ (do_tcp_sendpages)
 		tcp_push_one(sk, mss_now);
 		continue;

-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		tcp_push(sk, flags & ~MSG_MORE, mss_now,
 			 TCP_NAGLE_PUSH, size_goal);

@@ -1281,7 +1282,7 @@ (tcp_sendmsg_locked)

 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;

 			if (unlikely(process_backlog >= 16)) {
 				process_backlog = 0;
@@ -1292,7 +1293,7 @@ (tcp_sendmsg_locked)
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;

 			process_backlog++;
 			skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1325,7 +1326,7 @@ (tcp_sendmsg_locked)
 			struct page_frag *pfrag = sk_page_frag(sk);

 			if (!sk_page_frag_refill(sk, pfrag))
-				goto wait_for_memory;
+				goto wait_for_space;

 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
@@ -1339,7 +1340,7 @@ (tcp_sendmsg_locked)
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);

 			if (!sk_wmem_schedule(sk, copy))
-				goto wait_for_memory;
+				goto wait_for_space;

 			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
 						       pfrag->page,
@@ -1392,9 +1393,8 @@ (tcp_sendmsg_locked)
 		tcp_push_one(sk, mss_now);
 		continue;

-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		if (copied)
 			tcp_push(sk, flags & ~MSG_MORE, mss_now,
 				 TCP_NAGLE_PUSH, size_goal);