[PATCH] tcp: fix TSO sizing bugs

MSS changes can be lost since we preemptively initialize the tso_segs count
for an SKB before we 100% commit to sending it out.

So, by the time we send it out, the tso_size information can be stale due
to PMTU events. This mucks up all of the logic in our send engine, and can
even trigger the BUG() in tcp_tso_should_defer().

Another problem is that we're storing tp->mss_cache, not the
SACK-block-normalized MSS, as the tso_size. That's wrong too.
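
The fix is therefore to pass the current, SACK-normalized MSS (mss_now) down
into the TSO segmentation helpers, and to have tcp_init_tso_segs() revalidate
a previously computed tso_size at transmit time. The new check, as it appears
in the diff below (comments added here for illustration only), looks like this:

	static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
					    unsigned int mss_now)
	{
		int tso_segs = tcp_skb_pcount(skb);

		/* Recompute if the factor was never set, or if an MSS change
		 * (e.g. a PMTU event) has made the cached tso_size stale.
		 */
		if (!tso_segs ||
		    (tso_segs > 1 &&
		     skb_shinfo(skb)->tso_size != mss_now)) {
			tcp_set_skb_tso_segs(sk, skb, mss_now);
			tso_segs = tcp_skb_pcount(skb);
		}
		return tso_segs;
	}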

Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by David S. Miller, committed by Linus Torvalds · 846998ae 0c3dba15

+28 -28
net/ipv4/tcp_output.c
···
 	sk->sk_send_head = skb;
 }
 
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (skb->len <= tp->mss_cache ||
+	if (skb->len <= mss_now ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
···
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache - 1);
-		factor /= tp->mss_cache;
+		factor = skb->len + (mss_now - 1);
+		factor /= mss_now;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache;
+		skb_shinfo(skb)->tso_size = mss_now;
 	}
 }
 
···
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
···
 	}
 
 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out += tcp_skb_pcount(skb);
···
 	 * factor and mss.
 	 */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb);
+		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
 
 	return 0;
 }
···
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs) {
-		tcp_set_skb_tso_segs(sk, skb);
+	if (!tso_segs ||
+	    (tso_segs > 1 &&
+	     skb_shinfo(skb)->tso_size != mss_now)) {
+		tcp_set_skb_tso_segs(sk, skb, mss_now);
 		tso_segs = tcp_skb_pcount(skb);
 	}
 	return tso_segs;
···
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cwnd_quota;
 
-	tcp_init_tso_segs(sk, skb);
+	tcp_init_tso_segs(sk, skb, cur_mss);
 
 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
 		return 0;
···
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
···
 	skb_split(skb, buff, len);
 
 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
···
 	if (unlikely(!skb))
 		return 0;
 
-	tso_segs = tcp_init_tso_segs(sk, skb);
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	cwnd_quota = tcp_cwnd_test(tp, skb);
 	if (unlikely(!cwnd_quota))
 		goto out;
···
 					limit = skb->len - trim;
 			}
 			if (skb->len > limit) {
-				if (tso_fragment(sk, skb, limit))
+				if (tso_fragment(sk, skb, limit, mss_now))
 					break;
 			}
 		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+			if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
 				break;
 		}
 
···
 		skb = sk->sk_send_head;
 		if (!skb)
 			break;
-		tso_segs = tcp_init_tso_segs(sk, skb);
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	}
 
 	if (likely(sent_pkts)) {
···
 
 	BUG_ON(!skb || skb->len < mss_now);
 
-	tso_segs = tcp_init_tso_segs(sk, skb);
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
 
 	if (likely(cwnd_quota)) {
···
 					limit = skb->len - trim;
 			}
 			if (skb->len > limit) {
-				if (unlikely(tso_fragment(sk, skb, limit)))
+				if (unlikely(tso_fragment(sk, skb, limit, mss_now)))
 					return;
 			}
 		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+			if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
 				return;
 		}
 
···
 		int old_factor = tcp_skb_pcount(skb);
 		int new_factor;
 
-		if (tcp_fragment(sk, skb, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
···
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-			if (tcp_fragment(sk, skb, seg_size))
+			if (tcp_fragment(sk, skb, seg_size, mss))
 				return -1;
 			/* SWS override triggered forced fragmentation.
 			 * Disable TSO, the connection is too sick. */
···
 				sk->sk_route_caps &= ~NETIF_F_TSO;
 			}
 		} else if (!tcp_skb_pcount(skb))
-			tcp_set_skb_tso_segs(sk, skb);
+			tcp_set_skb_tso_segs(sk, skb, mss);
 
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;