Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mptcp: introduce mptcp-level backlog

We will soon use it for incoming data processing.
MPTCP can't leverage the sk_backlog, as the latter is processed
before the release callback, and such callback for MPTCP releases
and re-acquires the socket spinlock, breaking the sk_backlog processing
assumption.

Add a skb backlog list inside the mptcp sock struct, and implement
basic helpers to transfer packets to and purge such a list.

Packets in the backlog are memory accounted and still use the incoming
subflow receive memory, to allow back-pressure. The backlog size is
implicitly bounded to the sum of subflows rcvbuf.

When a subflow is closed, references from the backlog to such sock
are removed.

No packet is currently added to the backlog, so no functional changes
intended here.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-13-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Paolo Abeni and committed by
Jakub Kicinski
ee458a3f 9db5b3ce

+97 -9
+2 -1
net/mptcp/mptcp_diag.c
··· 195 195 struct mptcp_sock *msk = mptcp_sk(sk); 196 196 struct mptcp_info *info = _info; 197 197 198 - r->idiag_rqueue = sk_rmem_alloc_get(sk); 198 + r->idiag_rqueue = sk_rmem_alloc_get(sk) + 199 + READ_ONCE(mptcp_sk(sk)->backlog_len); 199 200 r->idiag_wqueue = sk_wmem_alloc_get(sk); 200 201 201 202 if (inet_sk_state_load(sk) == TCP_LISTEN) {
+76 -2
net/mptcp/protocol.c
··· 659 659 } 660 660 } 661 661 662 + static void __mptcp_add_backlog(struct sock *sk, 663 + struct mptcp_subflow_context *subflow, 664 + struct sk_buff *skb) 665 + { 666 + struct mptcp_sock *msk = mptcp_sk(sk); 667 + struct sk_buff *tail = NULL; 668 + bool fragstolen; 669 + int delta; 670 + 671 + if (unlikely(sk->sk_state == TCP_CLOSE)) { 672 + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 673 + return; 674 + } 675 + 676 + /* Try to coalesce with the last skb in our backlog */ 677 + if (!list_empty(&msk->backlog_list)) 678 + tail = list_last_entry(&msk->backlog_list, struct sk_buff, list); 679 + 680 + if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq && 681 + skb->sk == tail->sk && 682 + __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) { 683 + skb->truesize -= delta; 684 + kfree_skb_partial(skb, fragstolen); 685 + __mptcp_subflow_lend_fwdmem(subflow, delta); 686 + WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta); 687 + return; 688 + } 689 + 690 + list_add_tail(&skb->list, &msk->backlog_list); 691 + mptcp_subflow_lend_fwdmem(subflow, skb); 692 + WRITE_ONCE(msk->backlog_len, msk->backlog_len + skb->truesize); 693 + } 694 + 662 695 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 663 696 struct sock *ssk) 664 697 { ··· 738 705 size_t len = skb->len - offset; 739 706 740 707 mptcp_init_skb(ssk, skb, offset, len); 741 - mptcp_subflow_lend_fwdmem(subflow, skb); 742 - ret = __mptcp_move_skb(sk, skb) || ret; 708 + 709 + if (true) { 710 + mptcp_subflow_lend_fwdmem(subflow, skb); 711 + ret |= __mptcp_move_skb(sk, skb); 712 + } else { 713 + __mptcp_add_backlog(sk, subflow, skb); 714 + } 743 715 seq += len; 744 716 745 717 if (unlikely(map_remaining < len)) { ··· 2569 2531 void mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2570 2532 struct mptcp_subflow_context *subflow) 2571 2533 { 2534 + struct mptcp_sock *msk = mptcp_sk(sk); 2535 + struct sk_buff *skb; 2536 + 2572 2537 /* The first subflow can already be 
closed and still in the list */ 2573 2538 if (subflow->close_event_done) 2574 2539 return; ··· 2580 2539 2581 2540 if (sk->sk_state == TCP_ESTABLISHED) 2582 2541 mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); 2542 + 2543 + /* Remove any reference from the backlog to this ssk; backlog skbs consume 2544 + * space in the msk receive queue, no need to touch sk->sk_rmem_alloc 2545 + */ 2546 + list_for_each_entry(skb, &msk->backlog_list, list) { 2547 + if (skb->sk != ssk) 2548 + continue; 2549 + 2550 + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc); 2551 + skb->sk = NULL; 2552 + } 2583 2553 2584 2554 /* subflow aborted before reaching the fully_established status 2585 2555 * attempt the creation of the next subflow ··· 2821 2769 unlock_sock_fast(ssk, slow); 2822 2770 } 2823 2771 2772 + static void mptcp_backlog_purge(struct sock *sk) 2773 + { 2774 + struct mptcp_sock *msk = mptcp_sk(sk); 2775 + struct sk_buff *tmp, *skb; 2776 + LIST_HEAD(backlog); 2777 + 2778 + mptcp_data_lock(sk); 2779 + list_splice_init(&msk->backlog_list, &backlog); 2780 + msk->backlog_len = 0; 2781 + mptcp_data_unlock(sk); 2782 + 2783 + list_for_each_entry_safe(skb, tmp, &backlog, list) { 2784 + mptcp_borrow_fwdmem(sk, skb); 2785 + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2786 + } 2787 + sk_mem_reclaim(sk); 2788 + } 2789 + 2824 2790 static void mptcp_do_fastclose(struct sock *sk) 2825 2791 { 2826 2792 struct mptcp_subflow_context *subflow, *tmp; 2827 2793 struct mptcp_sock *msk = mptcp_sk(sk); 2828 2794 2829 2795 mptcp_set_state(sk, TCP_CLOSE); 2796 + mptcp_backlog_purge(sk); 2830 2797 2831 2798 /* Explicitly send the fastclose reset as need */ 2832 2799 if (__mptcp_check_fallback(msk)) ··· 2924 2853 INIT_LIST_HEAD(&msk->conn_list); 2925 2854 INIT_LIST_HEAD(&msk->join_list); 2926 2855 INIT_LIST_HEAD(&msk->rtx_queue); 2856 + INIT_LIST_HEAD(&msk->backlog_list); 2927 2857 INIT_WORK(&msk->work, mptcp_worker); 2928 2858 msk->out_of_order_queue = RB_ROOT; 2929 2859 
msk->first_pending = NULL; 2930 2860 msk->timer_ival = TCP_RTO_MIN; 2931 2861 msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 2862 + msk->backlog_len = 0; 2932 2863 2933 2864 WRITE_ONCE(msk->first, NULL); 2934 2865 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; ··· 3307 3234 struct sock *sk = (struct sock *)msk; 3308 3235 3309 3236 __mptcp_clear_xmit(sk); 3237 + mptcp_backlog_purge(sk); 3310 3238 3311 3239 /* join list will be eventually flushed (with rst) at sock lock release time */ 3312 3240 mptcp_for_each_subflow_safe(msk, subflow, tmp)
+19 -6
net/mptcp/protocol.h
··· 357 357 * allow_infinite_fallback and 358 358 * allow_join 359 359 */ 360 + 361 + struct list_head backlog_list; /* protected by the data lock */ 362 + u32 backlog_len; 360 363 }; 361 364 362 365 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) ··· 410 407 static inline int __mptcp_space(const struct sock *sk) 411 408 { 412 409 return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - 410 + READ_ONCE(mptcp_sk(sk)->backlog_len) - 413 411 sk_rmem_alloc_get(sk)); 414 412 } 415 413 ··· 659 655 { 660 656 struct sock *ssk = skb->sk; 661 657 662 - /* The subflow just lend the skb fwd memory, and we know that the skb 663 - * is only accounted on the incoming subflow rcvbuf. 658 + /* The subflow just lend the skb fwd memory; if the subflow meanwhile 659 + * closed, mptcp_close_ssk() already released the ssk rcv memory. 664 660 */ 665 661 DEBUG_NET_WARN_ON_ONCE(skb->destructor); 666 - skb->sk = NULL; 667 662 sk_forward_alloc_add(sk, skb->truesize); 663 + if (!ssk) 664 + return; 665 + 668 666 atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); 667 + skb->sk = NULL; 668 + } 669 + 670 + static inline void 671 + __mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) 672 + { 673 + int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); 674 + 675 + subflow->lent_mem_frag = frag; 669 676 } 670 677 671 678 static inline void 672 679 mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, 673 680 struct sk_buff *skb) 674 681 { 675 - int frag = (subflow->lent_mem_frag + skb->truesize) & (PAGE_SIZE - 1); 676 - 682 + __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); 677 683 skb->destructor = NULL; 678 - subflow->lent_mem_frag = frag; 679 684 } 680 685 681 686 static inline u64