Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mptcp-various-rare-sending-issues'

Matthieu Baerts says:

====================
mptcp: various rare sending issues

Here are various fixes from Paolo, addressing very occasional issues on
the sending side:

- Patch 1: drop an optimisation that could lead to timeout in case of
race conditions. A fix for up to v5.11.

- Patch 2: fix stream corruption under very specific conditions.
A fix for up to v5.13.

- Patch 3: restore MPTCP-level zero window probe after a recent fix.
A fix for up to v5.16.

- Patch 4: new MIB counter to track MPTCP-level zero window probes to
  help catch issues similar to the one fixed by the previous patch.
====================

Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-0-38ffff5a9ec8@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+39 -22
+1
net/mptcp/mib.c
··· 85 85 SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), 86 86 SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), 87 87 SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), 88 + SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), 88 89 }; 89 90 90 91 /* mptcp_mib_alloc - allocate percpu mib counters
+1
net/mptcp/mib.h
··· 88 88 MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ 89 89 MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */ 90 90 MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */ 91 + MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */ 91 92 __MPTCP_MIB_MAX 92 93 }; 93 94
+36 -21
net/mptcp/protocol.c
··· 1007 1007 if (WARN_ON_ONCE(!msk->recovery)) 1008 1008 break; 1009 1009 1010 - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1010 + msk->first_pending = mptcp_send_next(sk); 1011 1011 } 1012 1012 1013 1013 dfrag_clear(sk, dfrag); ··· 1299 1299 if (copy == 0) { 1300 1300 u64 snd_una = READ_ONCE(msk->snd_una); 1301 1301 1302 - if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { 1302 + /* No need for zero probe if there are any data pending 1303 + * either at the msk or ssk level; skb is the current write 1304 + * queue tail and can be empty at this point. 1305 + */ 1306 + if (snd_una != msk->snd_nxt || skb->len || 1307 + skb != tcp_send_head(ssk)) { 1303 1308 tcp_remove_empty_skb(ssk); 1304 1309 return 0; 1305 1310 } ··· 1355 1350 mpext->dsn64); 1356 1351 1357 1352 if (zero_window_probe) { 1353 + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE); 1358 1354 mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1359 1355 mpext->frozen = 1; 1360 1356 if (READ_ONCE(msk->csum_enabled)) ··· 1558 1552 1559 1553 mptcp_update_post_push(msk, dfrag, ret); 1560 1554 } 1561 - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1555 + msk->first_pending = mptcp_send_next(sk); 1562 1556 1563 1557 if (msk->snd_burst <= 0 || 1564 1558 !sk_stream_memory_free(ssk) || ··· 1918 1912 get_page(dfrag->page); 1919 1913 list_add_tail(&dfrag->list, &msk->rtx_queue); 1920 1914 if (!msk->first_pending) 1921 - WRITE_ONCE(msk->first_pending, dfrag); 1915 + msk->first_pending = dfrag; 1922 1916 } 1923 1917 pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, 1924 1918 dfrag->data_seq, dfrag->data_len, dfrag->already_sent, ··· 1951 1945 1952 1946 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); 1953 1947 1954 - static int __mptcp_recvmsg_mskq(struct sock *sk, 1955 - struct msghdr *msg, 1956 - size_t len, int flags, 1948 + static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, 1949 + size_t len, int flags, int copied_total, 1957 1950 
struct scm_timestamping_internal *tss, 1958 1951 int *cmsg_flags) 1959 1952 { 1960 1953 struct mptcp_sock *msk = mptcp_sk(sk); 1961 1954 struct sk_buff *skb, *tmp; 1955 + int total_data_len = 0; 1962 1956 int copied = 0; 1963 1957 1964 1958 skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { 1965 - u32 offset = MPTCP_SKB_CB(skb)->offset; 1959 + u32 delta, offset = MPTCP_SKB_CB(skb)->offset; 1966 1960 u32 data_len = skb->len - offset; 1967 - u32 count = min_t(size_t, len - copied, data_len); 1961 + u32 count; 1968 1962 int err; 1969 1963 1964 + if (flags & MSG_PEEK) { 1965 + /* skip already peeked skbs */ 1966 + if (total_data_len + data_len <= copied_total) { 1967 + total_data_len += data_len; 1968 + continue; 1969 + } 1970 + 1971 + /* skip the already peeked data in the current skb */ 1972 + delta = copied_total - total_data_len; 1973 + offset += delta; 1974 + data_len -= delta; 1975 + } 1976 + 1977 + count = min_t(size_t, len - copied, data_len); 1970 1978 if (!(flags & MSG_TRUNC)) { 1971 1979 err = skb_copy_datagram_msg(skb, offset, msg, count); 1972 1980 if (unlikely(err < 0)) { ··· 1997 1977 1998 1978 copied += count; 1999 1979 2000 - if (count < data_len) { 2001 - if (!(flags & MSG_PEEK)) { 1980 + if (!(flags & MSG_PEEK)) { 1981 + msk->bytes_consumed += count; 1982 + if (count < data_len) { 2002 1983 MPTCP_SKB_CB(skb)->offset += count; 2003 1984 MPTCP_SKB_CB(skb)->map_seq += count; 2004 - msk->bytes_consumed += count; 1985 + break; 2005 1986 } 2006 - break; 2007 - } 2008 1987 2009 - if (!(flags & MSG_PEEK)) { 2010 1988 /* avoid the indirect call, we know the destructor is sock_rfree */ 2011 1989 skb->destructor = NULL; 2012 1990 skb->sk = NULL; ··· 2012 1994 sk_mem_uncharge(sk, skb->truesize); 2013 1995 __skb_unlink(skb, &sk->sk_receive_queue); 2014 1996 skb_attempt_defer_free(skb); 2015 - msk->bytes_consumed += count; 2016 1997 } 2017 1998 2018 1999 if (copied >= len) ··· 2208 2191 while (copied < len) { 2209 2192 int err, bytes_read; 2210 2193 2211 - 
bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); 2194 + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, 2195 + copied, &tss, &cmsg_flags); 2212 2196 if (unlikely(bytes_read < 0)) { 2213 2197 if (!copied) 2214 2198 copied = bytes_read; ··· 2900 2882 struct mptcp_sock *msk = mptcp_sk(sk); 2901 2883 struct mptcp_data_frag *dtmp, *dfrag; 2902 2884 2903 - WRITE_ONCE(msk->first_pending, NULL); 2885 + msk->first_pending = NULL; 2904 2886 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 2905 2887 dfrag_clear(sk, dfrag); 2906 2888 } ··· 3440 3422 3441 3423 void __mptcp_check_push(struct sock *sk, struct sock *ssk) 3442 3424 { 3443 - if (!mptcp_send_head(sk)) 3444 - return; 3445 - 3446 3425 if (!sock_owned_by_user(sk)) 3447 3426 __mptcp_subflow_push_pending(sk, ssk, false); 3448 3427 else
+1 -1
net/mptcp/protocol.h
··· 414 414 { 415 415 const struct mptcp_sock *msk = mptcp_sk(sk); 416 416 417 - return READ_ONCE(msk->first_pending); 417 + return msk->first_pending; 418 418 } 419 419 420 420 static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)