Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: make connect() mem charging friendly

While working on sk_forward_alloc problems reported by Denys
Fedoryshchenko, we found that tcp connect() (and fastopen) do not call
sk_wmem_schedule() for SYN packet (and/or SYN/DATA packet), so
sk_forward_alloc is negative while connect is in progress.

We can fix this by calling regular sk_stream_alloc_skb() both for the
SYN packet (in tcp_connect()) and the syn_data packet in
tcp_send_syn_data()

Then, tcp_send_syn_data() can avoid copying syn_data as we simply
can manipulate syn_data->cb[] to remove SYN flag (and increment seq)

Instead of open coding memcpy_fromiovecend(), simply use this helper.

This leaves in socket write queue clean fast clone skbs.

This was tested against our fastopen packetdrill tests.

Reported-by: Denys Fedoryshchenko <nuclearcat@nuclearcat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Eric Dumazet; committed by David S. Miller.
Commit 355a901e (parent baeababb)

+29 -41
net/ipv4/tcp_output.c
@@ tcp_send_syn_data @@
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
-	struct sk_buff *syn_data = NULL, *data;
+	int syn_loss = 0, space, err = 0;
 	unsigned long last_syn_loss = 0;
+	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;	/* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
@@
 	/* limit to order-0 allocations */
 	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
 
-	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
-				   sk->sk_allocation);
-	if (syn_data == NULL)
+	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+	if (!syn_data)
 		goto fallback;
-
-	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
-		struct iovec *iov = &fo->data->msg_iov[i];
-		unsigned char __user *from = iov->iov_base;
-		int len = iov->iov_len;
-
-		if (syn_data->len + len > space)
-			len = space - syn_data->len;
-		else if (i + 1 == iovlen)
-			/* No more data pending in inet_wait_for_connect() */
-			fo->data = NULL;
-
-		if (skb_add_data(syn_data, from, len))
-			goto fallback;
+	syn_data->ip_summed = CHECKSUM_PARTIAL;
+	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+					 fo->data->msg_iov, 0, space))) {
+		kfree_skb(syn_data);
+		goto fallback;
 	}
 
-	/* Queue a data-only packet after the regular SYN for retransmission */
-	data = pskb_copy(syn_data, sk->sk_allocation);
-	if (data == NULL)
-		goto fallback;
-	TCP_SKB_CB(data)->seq++;
-	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
-	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
-	tcp_connect_queue_skb(sk, data);
-	fo->copied = data->len;
+	/* No more data pending in inet_wait_for_connect() */
+	if (space == fo->size)
+		fo->data = NULL;
+	fo->copied = space;
 
-	/* syn_data is about to be sent, we need to take current time stamps
-	 * for the packets that are in write queue : SYN packet and DATA
+	tcp_connect_queue_skb(sk, syn_data);
+
+	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+	syn->skb_mstamp = syn_data->skb_mstamp;
+
+	/* Now full SYN+DATA was cloned and sent (or not),
+	 * remove the SYN from the original skb (syn_data)
+	 * we keep in write queue in case of a retransmit, as we
+	 * also have the SYN packet (with no data) in the same queue.
 	 */
-	skb_mstamp_get(&syn->skb_mstamp);
-	data->skb_mstamp = syn->skb_mstamp;
-
-	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+	TCP_SKB_CB(syn_data)->seq++;
+	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+	if (!err) {
 		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
-	syn_data = NULL;
 
 fallback:
 	/* Send a regular SYN with Fast Open cookie request option */
@@
 	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
 	if (err)
 		tp->syn_fastopen = 0;
-	kfree_skb(syn_data);
 done:
 	fo->cookie.len = -1;	/* Exclude Fast Open option for SYN retries */
 	return err;
@@ tcp_connect @@
 		return 0;
 	}
 
-	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
-	if (unlikely(buff == NULL))
+	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+	if (unlikely(!buff))
 		return -ENOBUFS;
-
-	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	tp->retrans_stamp = tcp_time_stamp;