Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp-avoid-sending-too-small-packets'

Eric Dumazet says:

====================
tcp: avoid sending too small packets

tcp_sendmsg() cooks 'large' skbs, which are later split
if needed in tcp_write_xmit().

After a split, the leftover skb size is smaller than the optimal
size, and this causes a performance drop.

In this series, a tcp_grow_skb() helper is added to shift
payload from the second skb in the write queue to the first
skb, so that optimally sized skbs are always sent.

This increases TSO efficiency, and decreases number of ACK
packets.
====================

Link: https://lore.kernel.org/r/20240418214600.1291486-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+53 -25
net/ipv4/tcp_output.c
@@ tcp_set_skb_tso_segs @@
 }
 
 /* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
+	int tso_segs;
+
 	if (skb->len <= mss_now) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		tcp_skb_pcount_set(skb, 1);
 		TCP_SKB_CB(skb)->tcp_gso_size = 0;
-	} else {
-		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
-		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+		tcp_skb_pcount_set(skb, 1);
+		return 1;
 	}
+	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+	tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+	tcp_skb_pcount_set(skb, tso_segs);
+	return tso_segs;
 }
 
 /* Pcount in the middle of the write queue got changed, we need to do various

@@ tcp_cwnd_test @@
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules? If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
-					 const struct sk_buff *skb)
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
 {
 	u32 in_flight, cwnd, halfcwnd;
-
-	/* Don't be strict about the congestion window for the final FIN.
-	 */
-	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
-		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
 	cwnd = tcp_snd_cwnd(tp);

@@ tcp_init_tso_segs @@
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
-		tcp_set_skb_tso_segs(skb, mss_now);
-		tso_segs = tcp_skb_pcount(skb);
-	}
+	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+		return tcp_set_skb_tso_segs(skb, mss_now);
+
 	return tso_segs;
 }

@@ tcp_grow_skb (new helper) @@
 	tcp_chrono_set(tp, TCP_CHRONO_BUSY);
 }
 
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+	struct sk_buff *next_skb = skb->next;
+	unsigned int nlen;
+
+	if (tcp_skb_is_last(sk, skb))
+		return;
+
+	if (!tcp_skb_can_collapse(skb, next_skb))
+		return;
+
+	nlen = min_t(u32, amount, next_skb->len);
+	if (!nlen || !skb_shift(skb, next_skb, nlen))
+		return;
+
+	TCP_SKB_CB(skb)->end_seq += nlen;
+	TCP_SKB_CB(next_skb)->seq += nlen;
+
+	if (!next_skb->len) {
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+		TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
+		TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+		tcp_unlink_write_queue(next_skb, sk);
+		tcp_wmem_free_skb(sk, next_skb);
+	}
+}
+
 /* This routine writes packets to the network. It advances the
  * send_head. This happens as incoming acks open up the remote
  * window for us.

@@ tcp_write_xmit @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
-	int cwnd_quota;
+	u32 cwnd_quota, max_segs;
 	int result;
 	bool is_cwnd_limited = false, is_rwnd_limited = false;
-	u32 max_segs;
 
 	sent_pkts = 0;
...
 	max_segs = tcp_tso_segs(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
+		int missing_bytes;
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
...
 		if (tcp_pacing_check(sk))
 			break;
 
-		tso_segs = tcp_init_tso_segs(skb, mss_now);
-		BUG_ON(!tso_segs);
-
-		cwnd_quota = tcp_cwnd_test(tp, skb);
+		cwnd_quota = tcp_cwnd_test(tp);
 		if (!cwnd_quota) {
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
...
 			else
 				break;
 		}
+		cwnd_quota = min(cwnd_quota, max_segs);
+		missing_bytes = cwnd_quota * mss_now - skb->len;
+		if (missing_bytes > 0)
+			tcp_grow_skb(sk, skb, missing_bytes);
+
+		tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
 			is_rwnd_limited = true;
...
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  max_segs),
+						    cwnd_quota,
 						    nonagle);
 
 		if (skb->len > limit &&