
Merge branch 'tcp-tx-headless'

Eric Dumazet says:

====================
tcp: tx path fully headless

This series completes the transition of the TCP stack's tx path
to headless packets: all payload now resides in page frags,
never in skb->head.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
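
The invariant the series establishes is easy to state in code. Below is a
minimal sketch (the helper name tcp_skb_is_headless is hypothetical, not
part of this series): once the tx path is fully headless, every queued TCP
skb keeps its payload exclusively in page frags.

    /* Hypothetical helper illustrating the invariant: no payload bytes
     * remain in the linear area, so skb->len == skb->data_len.
     */
    static inline bool tcp_skb_is_headless(const struct sk_buff *skb)
    {
            return skb_headlen(skb) == 0;   /* len - data_len == 0 */
    }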

+40 -48 (total, 3 files)

include/net/tcp.h  (+2 -1)
···
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
                         size_t size, int flags);
 int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
+int tcp_wmem_schedule(struct sock *sk, int copy);
 void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
               int size_goal);
 void tcp_release_cb(struct sock *sk);
···
 ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
                         struct pipe_inode_info *pipe, size_t len,
                         unsigned int flags);
-struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                                      bool force_schedule);

 void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
net/ipv4/tcp.c  (+4 -4)
···
 }
 EXPORT_SYMBOL(tcp_splice_read);

-struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                                      bool force_schedule)
 {
        struct sk_buff *skb;

-       skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
+       skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
        if (likely(skb)) {
                bool mem_scheduled;
···
 }

-static int tcp_wmem_schedule(struct sock *sk, int copy)
+int tcp_wmem_schedule(struct sock *sk, int copy)
 {
        int left;
···
                        goto restart;
                }
                first_skb = tcp_rtx_and_write_queues_empty(sk);
-               skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+               skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
                                           first_skb);
                if (!skb)
                        goto wait_for_space;
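
For context, here is a condensed sketch of the calling pattern in
tcp_sendmsg_locked() after this change: headers go in the skb head, payload
goes through the per-socket page frag. Illustrative only; the function name
tcp_append_payload_sketch is made up, the copy step is elided, and the real
code also coalesces into the last frag and handles zerocopy.

    /* Illustrative sketch, not kernel code: append user payload to a
     * headless skb via the socket's page frag.
     */
    static int tcp_append_payload_sketch(struct sock *sk, struct sk_buff *skb,
                                         int size)
    {
            struct page_frag *pfrag = sk_page_frag(sk);
            int copy;

            if (!sk_page_frag_refill(sk, pfrag))
                    return -ENOMEM;

            copy = min_t(int, size, pfrag->size - pfrag->offset);
            copy = tcp_wmem_schedule(sk, copy);     /* may grant less */
            if (!copy)
                    return -ENOMEM;

            /* ... copy 'copy' bytes of user data to pfrag->page/offset ... */

            /* Attach the payload as a frag; the skb head stays untouched. */
            skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
                               pfrag->page, pfrag->offset, copy);
            page_ref_inc(pfrag->page);
            pfrag->offset += copy;
            skb_len_add(skb, copy); /* updates len, data_len, truesize */
            return copy;
    }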
net/ipv4/tcp_output.c  (+34 -43)
···
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
-       int nsize, old_factor;
+       int old_factor;
        long limit;
        int nlen;
        u8 flags;
···
        if (WARN_ON(len > skb->len))
                return -EINVAL;

-       nsize = skb_headlen(skb) - len;
-       if (nsize < 0)
-               nsize = 0;
+       DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));

        /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
         * We need some allowance to not penalize applications setting small
···
                return -ENOMEM;

        /* Get a new skb... force flag on. */
-       buff = tcp_stream_alloc_skb(sk, nsize, gfp, true);
+       buff = tcp_stream_alloc_skb(sk, gfp, true);
        if (!buff)
                return -ENOMEM; /* We'll just try again later. */
        skb_copy_decrypted(buff, skb);
···
        sk_wmem_queued_add(sk, buff->truesize);
        sk_mem_charge(sk, buff->truesize);
-       nlen = skb->len - len - nsize;
+       nlen = skb->len - len;
        buff->truesize += nlen;
        skb->truesize -= nlen;
···
        struct skb_shared_info *shinfo;
        int i, k, eat;

-       eat = min_t(int, len, skb_headlen(skb));
-       if (eat) {
-               __skb_pull(skb, eat);
-               len -= eat;
-               if (!len)
-                       return 0;
-       }
+       DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
        eat = len;
        k = 0;
        shinfo = skb_shinfo(skb);
···

        TCP_SKB_CB(skb)->seq += len;

-       if (delta_truesize) {
-               skb->truesize -= delta_truesize;
-               sk_wmem_queued_add(sk, -delta_truesize);
-               if (!skb_zcopy_pure(skb))
-                       sk_mem_uncharge(sk, delta_truesize);
-       }
+       skb->truesize -= delta_truesize;
+       sk_wmem_queued_add(sk, -delta_truesize);
+       if (!skb_zcopy_pure(skb))
+               sk_mem_uncharge(sk, delta_truesize);

        /* Any change of skb->len requires recalculation of tso factor. */
        if (tcp_skb_pcount(skb) > 1)
···
        u8 flags;

        /* All of a TSO frame must be composed of paged data. */
-       if (skb->len != skb->data_len)
-               return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
-                                   skb, len, mss_now, gfp);
+       DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);

-       buff = tcp_stream_alloc_skb(sk, 0, gfp, true);
+       buff = tcp_stream_alloc_skb(sk, gfp, true);
        if (unlikely(!buff))
                return -ENOMEM;
        skb_copy_decrypted(buff, skb);
···
                return -1;

        /* We're allowed to probe. Build it now. */
-       nskb = tcp_stream_alloc_skb(sk, 0, GFP_ATOMIC, false);
+       nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
        if (!nskb)
                return -1;
···
        } else {
                TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
                                               ~(TCPHDR_FIN|TCPHDR_PSH);
-               if (!skb_shinfo(skb)->nr_frags) {
-                       skb_pull(skb, copy);
-               } else {
-                       __pskb_trim_head(skb, copy);
-                       tcp_set_skb_tso_segs(skb, mss_now);
-               }
+               __pskb_trim_head(skb, copy);
+               tcp_set_skb_tso_segs(skb, mss_now);
                TCP_SKB_CB(skb)->seq += copy;
        }
···
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_fastopen_request *fo = tp->fastopen_req;
-       int space, err = 0;
+       struct page_frag *pfrag = sk_page_frag(sk);
        struct sk_buff *syn_data;
+       int space, err = 0;

        tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
        if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
···
        space = min_t(size_t, space, fo->size);

-       /* limit to order-0 allocations */
-       space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
-
-       syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
+       if (space &&
+           !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
+                                 pfrag, sk->sk_allocation))
+               goto fallback;
+       syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
        if (!syn_data)
                goto fallback;
        memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
        if (space) {
-               int copied = copy_from_iter(skb_put(syn_data, space), space,
-                                           &fo->data->msg_iter);
-               if (unlikely(!copied)) {
+               space = min_t(size_t, space, pfrag->size - pfrag->offset);
+               space = tcp_wmem_schedule(sk, space);
+       }
+       if (space) {
+               space = copy_page_from_iter(pfrag->page, pfrag->offset,
+                                           space, &fo->data->msg_iter);
+               if (unlikely(!space)) {
                        tcp_skb_tsorted_anchor_cleanup(syn_data);
                        kfree_skb(syn_data);
                        goto fallback;
                }
-               if (copied != space) {
-                       skb_trim(syn_data, copied);
-                       space = copied;
-               }
+               skb_fill_page_desc(syn_data, 0, pfrag->page,
+                                  pfrag->offset, space);
+               page_ref_inc(pfrag->page);
+               pfrag->offset += space;
+               skb_len_add(syn_data, space);
                skb_zcopy_set(syn_data, fo->uarg, NULL);
        }
        /* No more data pending in inet_wait_for_connect() */
···
                return 0;
        }

-       buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
+       buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
        if (unlikely(!buff))
                return -ENOBUFS;
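
A note on the assertions: the runtime fallbacks removed above are replaced
by DEBUG_NET_WARN_ON_ONCE(), which only expands to a WARN when
CONFIG_DEBUG_NET is enabled, so production builds pay nothing for the
check. Roughly as follows (see include/net/net_debug.h for the
authoritative definition):

    #if defined(CONFIG_DEBUG_NET)
    #define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond))
    #else
    #define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
    #endif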