Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mptcp-implement-read_sock-and-splice_read'

Matthieu Baerts says:

====================
mptcp: implement .read_sock and .splice_read

This series is preparation work for future in-kernel use of MPTCP
sockets. Here, two interfaces are implemented: read_sock and splice_read.
As a result of this series, splice() with MPTCP sockets -- which was
already supported -- is now improved.

- Patches 1-2: .read_sock implementation

- Patches 3-4: .splice_read implementation

- Patches 5-6: validate splice() support with MPTCP sockets.
====================

Link: https://patch.msgid.link/20260130-net-next-mptcp-splice-v2-0-31332ba70d7f@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+308 -19
+11
include/net/tcp.h
··· 347 347 #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) 348 348 #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) 349 349 350 + /* 351 + * TCP splice context 352 + */ 353 + struct tcp_splice_state { 354 + struct pipe_inode_info *pipe; 355 + size_t len; 356 + unsigned int flags; 357 + }; 358 + 350 359 void tcp_tsq_work_init(void); 351 360 352 361 int tcp_v4_err(struct sk_buff *skb, u32); ··· 387 378 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); 388 379 void tcp_twsk_destructor(struct sock *sk); 389 380 void tcp_twsk_purge(struct list_head *net_exit_list); 381 + int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, 382 + unsigned int offset, size_t len); 390 383 ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, 391 384 struct pipe_inode_info *pipe, size_t len, 392 385 unsigned int flags);
+2 -11
net/ipv4/tcp.c
··· 319 319 EXPORT_IPV6_MOD(tcp_sockets_allocated); 320 320 321 321 /* 322 - * TCP splice context 323 - */ 324 - struct tcp_splice_state { 325 - struct pipe_inode_info *pipe; 326 - size_t len; 327 - unsigned int flags; 328 - }; 329 - 330 - /* 331 322 * Pressure flag: try to collapse. 332 323 * Technical note: it is used by multiple contexts non atomically. 333 324 * All the __sk_mem_schedule() is of this nature: accounting ··· 782 791 __tcp_push_pending_frames(sk, mss_now, nonagle); 783 792 } 784 793 785 - static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, 786 - unsigned int offset, size_t len) 794 + int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, 795 + unsigned int offset, size_t len) 787 796 { 788 797 struct tcp_splice_state *tss = rd_desc->arg.data; 789 798 int ret;
+211 -7
net/mptcp/protocol.c
··· 1995 1995 1996 1996 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); 1997 1997 1998 + static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) 1999 + { 2000 + /* avoid the indirect call, we know the destructor is sock_rfree */ 2001 + skb->destructor = NULL; 2002 + skb->sk = NULL; 2003 + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 2004 + sk_mem_uncharge(sk, skb->truesize); 2005 + __skb_unlink(skb, &sk->sk_receive_queue); 2006 + skb_attempt_defer_free(skb); 2007 + } 2008 + 1998 2009 static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, 1999 2010 size_t len, int flags, int copied_total, 2000 2011 struct scm_timestamping_internal *tss, ··· 2060 2049 break; 2061 2050 } 2062 2051 2063 - /* avoid the indirect call, we know the destructor is sock_rfree */ 2064 - skb->destructor = NULL; 2065 - skb->sk = NULL; 2066 - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 2067 - sk_mem_uncharge(sk, skb->truesize); 2068 - __skb_unlink(skb, &sk->sk_receive_queue); 2069 - skb_attempt_defer_free(skb); 2052 + mptcp_eat_recv_skb(sk, skb); 2070 2053 } 2071 2054 2072 2055 if (copied >= len) ··· 4317 4312 return mask; 4318 4313 } 4319 4314 4315 + static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off) 4316 + { 4317 + struct mptcp_sock *msk = mptcp_sk(sk); 4318 + struct sk_buff *skb; 4319 + u32 offset; 4320 + 4321 + if (!list_empty(&msk->backlog_list)) 4322 + mptcp_move_skbs(sk); 4323 + 4324 + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 4325 + offset = MPTCP_SKB_CB(skb)->offset; 4326 + if (offset < skb->len) { 4327 + *off = offset; 4328 + return skb; 4329 + } 4330 + mptcp_eat_recv_skb(sk, skb); 4331 + } 4332 + return NULL; 4333 + } 4334 + 4335 + /* 4336 + * Note: 4337 + * - It is assumed that the socket was locked by the caller. 
4338 + */ 4339 + static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, 4340 + sk_read_actor_t recv_actor, bool noack) 4341 + { 4342 + struct mptcp_sock *msk = mptcp_sk(sk); 4343 + struct sk_buff *skb; 4344 + int copied = 0; 4345 + u32 offset; 4346 + 4347 + msk_owned_by_me(msk); 4348 + 4349 + if (sk->sk_state == TCP_LISTEN) 4350 + return -ENOTCONN; 4351 + while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) { 4352 + u32 data_len = skb->len - offset; 4353 + int count; 4354 + u32 size; 4355 + 4356 + size = min_t(size_t, data_len, INT_MAX); 4357 + count = recv_actor(desc, skb, offset, size); 4358 + if (count <= 0) { 4359 + if (!copied) 4360 + copied = count; 4361 + break; 4362 + } 4363 + 4364 + copied += count; 4365 + 4366 + msk->bytes_consumed += count; 4367 + if (count < data_len) { 4368 + MPTCP_SKB_CB(skb)->offset += count; 4369 + MPTCP_SKB_CB(skb)->map_seq += count; 4370 + break; 4371 + } 4372 + 4373 + mptcp_eat_recv_skb(sk, skb); 4374 + } 4375 + 4376 + if (noack) 4377 + goto out; 4378 + 4379 + mptcp_rcv_space_adjust(msk, copied); 4380 + 4381 + if (copied > 0) { 4382 + mptcp_recv_skb(sk, &offset); 4383 + mptcp_cleanup_rbuf(msk, copied); 4384 + } 4385 + out: 4386 + return copied; 4387 + } 4388 + 4389 + static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, 4390 + sk_read_actor_t recv_actor) 4391 + { 4392 + return __mptcp_read_sock(sk, desc, recv_actor, false); 4393 + } 4394 + 4395 + static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) 4396 + { 4397 + /* Store TCP splice context information in read_descriptor_t. 
*/ 4398 + read_descriptor_t rd_desc = { 4399 + .arg.data = tss, 4400 + .count = tss->len, 4401 + }; 4402 + 4403 + return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); 4404 + } 4405 + 4406 + /** 4407 + * mptcp_splice_read - splice data from MPTCP socket to a pipe 4408 + * @sock: socket to splice from 4409 + * @ppos: position (not valid) 4410 + * @pipe: pipe to splice to 4411 + * @len: number of bytes to splice 4412 + * @flags: splice modifier flags 4413 + * 4414 + * Description: 4415 + * Will read pages from given socket and fill them into a pipe. 4416 + * 4417 + * Return: 4418 + * Amount of bytes that have been spliced. 4419 + * 4420 + **/ 4421 + static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos, 4422 + struct pipe_inode_info *pipe, size_t len, 4423 + unsigned int flags) 4424 + { 4425 + struct tcp_splice_state tss = { 4426 + .pipe = pipe, 4427 + .len = len, 4428 + .flags = flags, 4429 + }; 4430 + struct sock *sk = sock->sk; 4431 + ssize_t spliced = 0; 4432 + int ret = 0; 4433 + long timeo; 4434 + 4435 + /* 4436 + * We can't seek on a socket input 4437 + */ 4438 + if (unlikely(*ppos)) 4439 + return -ESPIPE; 4440 + 4441 + lock_sock(sk); 4442 + 4443 + mptcp_rps_record_subflows(mptcp_sk(sk)); 4444 + 4445 + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); 4446 + while (tss.len) { 4447 + ret = __mptcp_splice_read(sk, &tss); 4448 + if (ret < 0) { 4449 + break; 4450 + } else if (!ret) { 4451 + if (spliced) 4452 + break; 4453 + if (sock_flag(sk, SOCK_DONE)) 4454 + break; 4455 + if (sk->sk_err) { 4456 + ret = sock_error(sk); 4457 + break; 4458 + } 4459 + if (sk->sk_shutdown & RCV_SHUTDOWN) 4460 + break; 4461 + if (sk->sk_state == TCP_CLOSE) { 4462 + /* 4463 + * This occurs when user tries to read 4464 + * from never connected socket. 
4465 + */ 4466 + ret = -ENOTCONN; 4467 + break; 4468 + } 4469 + if (!timeo) { 4470 + ret = -EAGAIN; 4471 + break; 4472 + } 4473 + /* if __mptcp_splice_read() got nothing while we have 4474 + * an skb in receive queue, we do not want to loop. 4475 + * This might happen with URG data. 4476 + */ 4477 + if (!skb_queue_empty(&sk->sk_receive_queue)) 4478 + break; 4479 + ret = sk_wait_data(sk, &timeo, NULL); 4480 + if (ret < 0) 4481 + break; 4482 + if (signal_pending(current)) { 4483 + ret = sock_intr_errno(timeo); 4484 + break; 4485 + } 4486 + continue; 4487 + } 4488 + tss.len -= ret; 4489 + spliced += ret; 4490 + 4491 + if (!tss.len || !timeo) 4492 + break; 4493 + release_sock(sk); 4494 + lock_sock(sk); 4495 + 4496 + if (sk->sk_err || sk->sk_state == TCP_CLOSE || 4497 + (sk->sk_shutdown & RCV_SHUTDOWN) || 4498 + signal_pending(current)) 4499 + break; 4500 + } 4501 + 4502 + release_sock(sk); 4503 + 4504 + if (spliced) 4505 + return spliced; 4506 + 4507 + return ret; 4508 + } 4509 + 4320 4510 static const struct proto_ops mptcp_stream_ops = { 4321 4511 .family = PF_INET, 4322 4512 .owner = THIS_MODULE, ··· 4532 4332 .recvmsg = inet_recvmsg, 4533 4333 .mmap = sock_no_mmap, 4534 4334 .set_rcvlowat = mptcp_set_rcvlowat, 4335 + .read_sock = mptcp_read_sock, 4336 + .splice_read = mptcp_splice_read, 4535 4337 }; 4536 4338 4537 4339 static struct inet_protosw mptcp_protosw = { ··· 4638 4436 .compat_ioctl = inet6_compat_ioctl, 4639 4437 #endif 4640 4438 .set_rcvlowat = mptcp_set_rcvlowat, 4439 + .read_sock = mptcp_read_sock, 4440 + .splice_read = mptcp_splice_read, 4641 4441 }; 4642 4442 4643 4443 static struct proto mptcp_v6_prot;
+1
tools/testing/selftests/net/mptcp/Makefile
··· 11 11 mptcp_connect_checksum.sh \ 12 12 mptcp_connect_mmap.sh \ 13 13 mptcp_connect_sendfile.sh \ 14 + mptcp_connect_splice.sh \ 14 15 mptcp_join.sh \ 15 16 mptcp_sockopt.sh \ 16 17 pm_netlink.sh \
+78 -1
tools/testing/selftests/net/mptcp/mptcp_connect.c
··· 52 52 CFG_MODE_POLL, 53 53 CFG_MODE_MMAP, 54 54 CFG_MODE_SENDFILE, 55 + CFG_MODE_SPLICE, 55 56 }; 56 57 57 58 enum cfg_peek { ··· 125 124 fprintf(stderr, "\t-j -- add additional sleep at connection start and tear down " 126 125 "-- for MPJ tests\n"); 127 126 fprintf(stderr, "\t-l -- listens mode, accepts incoming connection\n"); 128 - fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); 127 + fprintf(stderr, "\t-m [poll|mmap|sendfile|splice] -- use poll(default)/mmap+write/sendfile/splice\n"); 129 128 fprintf(stderr, "\t-M mark -- set socket packet mark\n"); 130 129 fprintf(stderr, "\t-o option -- test sockopt <option>\n"); 131 130 fprintf(stderr, "\t-p num -- use port num\n"); ··· 936 935 return err; 937 936 } 938 937 938 + static int do_splice(const int infd, const int outfd, const size_t len, 939 + struct wstate *winfo) 940 + { 941 + ssize_t in_bytes, out_bytes; 942 + int pipefd[2]; 943 + int err; 944 + 945 + err = pipe(pipefd); 946 + if (err) { 947 + perror("pipe"); 948 + return 2; 949 + } 950 + 951 + again: 952 + in_bytes = splice(infd, NULL, pipefd[1], NULL, len - winfo->total_len, 953 + SPLICE_F_MOVE | SPLICE_F_MORE); 954 + if (in_bytes < 0) { 955 + perror("splice in"); 956 + err = 3; 957 + } else if (in_bytes > 0) { 958 + out_bytes = splice(pipefd[0], NULL, outfd, NULL, in_bytes, 959 + SPLICE_F_MOVE | SPLICE_F_MORE); 960 + if (out_bytes < 0) { 961 + perror("splice out"); 962 + err = 4; 963 + } else if (in_bytes != out_bytes) { 964 + fprintf(stderr, "Unexpected transfer: %zu vs %zu\n", 965 + in_bytes, out_bytes); 966 + err = 5; 967 + } else { 968 + goto again; 969 + } 970 + } 971 + 972 + close(pipefd[0]); 973 + close(pipefd[1]); 974 + 975 + return err; 976 + } 977 + 978 + static int copyfd_io_splice(int infd, int peerfd, int outfd, unsigned int size, 979 + bool *in_closed_after_out, struct wstate *winfo) 980 + { 981 + int err; 982 + 983 + if (listen_mode) { 984 + err = do_splice(peerfd, outfd, size, winfo); 985 + if 
(err) 986 + return err; 987 + 988 + err = do_splice(infd, peerfd, size, winfo); 989 + } else { 990 + err = do_splice(infd, peerfd, size, winfo); 991 + if (err) 992 + return err; 993 + 994 + shut_wr(peerfd); 995 + 996 + err = do_splice(peerfd, outfd, size, winfo); 997 + *in_closed_after_out = true; 998 + } 999 + 1000 + return err; 1001 + } 1002 + 939 1003 static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct wstate *winfo) 940 1004 { 941 1005 bool in_closed_after_out = false; ··· 1031 965 return file_size; 1032 966 ret = copyfd_io_sendfile(infd, peerfd, outfd, file_size, 1033 967 &in_closed_after_out, winfo); 968 + break; 969 + 970 + case CFG_MODE_SPLICE: 971 + file_size = get_infd_size(infd); 972 + if (file_size < 0) 973 + return file_size; 974 + ret = copyfd_io_splice(infd, peerfd, outfd, file_size, 975 + &in_closed_after_out, winfo); 1034 976 break; 1035 977 1036 978 default: ··· 1454 1380 return CFG_MODE_MMAP; 1455 1381 if (!strcasecmp(mode, "sendfile")) 1456 1382 return CFG_MODE_SENDFILE; 1383 + if (!strcasecmp(mode, "splice")) 1384 + return CFG_MODE_SPLICE; 1457 1385 1458 1386 fprintf(stderr, "Unknown test mode: %s\n", mode); 1459 1387 fprintf(stderr, "Supported modes are:\n"); 1460 1388 fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n"); 1461 1389 fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n"); 1462 1390 fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n"); 1391 + fprintf(stderr, "\t\t\"splice\" - send entire input file (splice), then read response (-l will read input first)\n"); 1463 1392 1464 1393 die_usage(); 1465 1394
+5
tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \ 5 + "$(dirname "${0}")/mptcp_connect.sh" -m splice "${@}"