Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mptcp: borrow forward memory from subflow

In the MPTCP receive path, we release the subflow allocated fwd
memory just to allocate it again shortly after for the msk.

That could increase the chances of failure, especially when we will
add backlog processing, as other actions could consume the just
released memory before the msk socket has a chance to do the
rcv allocation.

Replace the skb_orphan() call with an open-coded variant that
explicitly borrows the fwd memory from the subflow socket instead
of releasing it.

The borrowed memory does not have PAGE_SIZE granularity; rounding to
the page size will make the fwd allocated memory higher than what is
strictly required and could make the incoming subflow fwd mem
consistently negative. Instead, keep track of the accumulated frag and
borrow the full page at subflow close time.

This allows removing the last drop in the TCP to MPTCP transition and
the associated, now unused, MIB.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-12-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Paolo Abeni and committed by
Jakub Kicinski
9db5b3ce 0eeb372d

+46 -11
+3 -1
net/mptcp/fastopen.c
··· 32 32 /* dequeue the skb from sk receive queue */ 33 33 __skb_unlink(skb, &ssk->sk_receive_queue); 34 34 skb_ext_reset(skb); 35 - skb_orphan(skb); 35 + 36 + mptcp_subflow_lend_fwdmem(subflow, skb); 36 37 37 38 /* We copy the fastopen data, but that don't belong to the mptcp sequence 38 39 * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() ··· 51 50 mptcp_data_lock(sk); 52 51 DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); 53 52 53 + mptcp_borrow_fwdmem(sk, skb); 54 54 skb_set_owner_r(skb, sk); 55 55 __skb_queue_tail(&sk->sk_receive_queue, skb); 56 56 mptcp_sk(sk)->bytes_received += skb->len;
-1
net/mptcp/mib.c
··· 71 71 SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), 72 72 SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), 73 73 SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), 74 - SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), 75 74 SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), 76 75 SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), 77 76 SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
-1
net/mptcp/mib.h
··· 70 70 MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ 71 71 MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ 72 72 MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ 73 - MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ 74 73 MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ 75 74 MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ 76 75 MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
+15 -8
net/mptcp/protocol.c
··· 358 358 static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, 359 359 int copy_len) 360 360 { 361 - const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 361 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 362 362 bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 363 363 364 364 /* the skb map_seq accounts for the skb offset: ··· 383 383 struct mptcp_sock *msk = mptcp_sk(sk); 384 384 struct sk_buff *tail; 385 385 386 - /* try to fetch required memory from subflow */ 387 - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 388 - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 389 - goto drop; 390 - } 386 + mptcp_borrow_fwdmem(sk, skb); 391 387 392 388 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 393 389 /* in sequence */ ··· 405 409 * will retransmit as needed, if needed. 406 410 */ 407 411 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 408 - drop: 409 412 mptcp_drop(sk, skb); 410 413 return false; 411 414 } ··· 705 710 size_t len = skb->len - offset; 706 711 707 712 mptcp_init_skb(ssk, skb, offset, len); 708 - skb_orphan(skb); 713 + mptcp_subflow_lend_fwdmem(subflow, skb); 709 714 ret = __mptcp_move_skb(sk, skb) || ret; 710 715 seq += len; 711 716 ··· 2431 2436 { 2432 2437 struct mptcp_sock *msk = mptcp_sk(sk); 2433 2438 bool dispose_it, need_push = false; 2439 + int fwd_remaining; 2434 2440 2435 2441 /* Do not pass RX data to the msk, even if the subflow socket is not 2436 2442 * going to be freed (i.e. even for the first subflow on graceful ··· 2439 2443 */ 2440 2444 lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2441 2445 subflow->closing = 1; 2446 + 2447 + /* Borrow the fwd allocated page left-over; fwd memory for the subflow 2448 + * could be negative at this point, but will be reach zero soon - when 2449 + * the data allocated using such fragment will be freed. 
2450 + */ 2451 + if (subflow->lent_mem_frag) { 2452 + fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag; 2453 + sk_forward_alloc_add(sk, fwd_remaining); 2454 + sk_forward_alloc_add(ssk, -fwd_remaining); 2455 + subflow->lent_mem_frag = 0; 2456 + } 2442 2457 2443 2458 /* If the first subflow moved to a close state before accept, e.g. due 2444 2459 * to an incoming reset or listener shutdown, the subflow socket is
+28
net/mptcp/protocol.h
··· 547 547 bool scheduled; 548 548 bool pm_listener; /* a listener managed by the kernel PM? */ 549 549 bool fully_established; /* path validated */ 550 + u32 lent_mem_frag; 550 551 u32 remote_nonce; 551 552 u64 thmac; 552 553 u32 local_nonce; ··· 645 644 646 645 reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); 647 646 tcp_send_active_reset(sk, GFP_ATOMIC, reason); 647 + } 648 + 649 + /* Made the fwd mem carried by the given skb available to the msk, 650 + * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing 651 + * the skb or setting the skb ownership. 652 + */ 653 + static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) 654 + { 655 + struct sock *ssk = skb->sk; 656 + 657 + /* The subflow just lend the skb fwd memory, and we know that the skb 658 + * is only accounted on the incoming subflow rcvbuf. 659 + */ 660 + DEBUG_NET_WARN_ON_ONCE(skb->destructor); 661 + skb->sk = NULL; 662 + sk_forward_alloc_add(sk, skb->truesize); 663 + atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); 664 + } 665 + 666 + static inline void 667 + mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, 668 + struct sk_buff *skb) 669 + { 670 + int frag = (subflow->lent_mem_frag + skb->truesize) & (PAGE_SIZE - 1); 671 + 672 + skb->destructor = NULL; 673 + subflow->lent_mem_frag = frag; 648 674 } 649 675 650 676 static inline u64