
Merge branch 'mptcp-part-two'

Christoph Paasch says:

====================
Multipath TCP part 2: Single subflow & RFC8684 support

v2 -> v3: Added RFC8684-style handshake (see below for more details) and some minor fixes
v1 -> v2: Rebased on latest "Multipath TCP: Prerequisites" v3 series

This set adds MPTCP connection establishment, writing & reading MPTCP
options on data packets, a per-namespace sysctl to enable MPTCP, and
selftests. This is sufficient to establish and maintain a connection with
an MPTCP peer, but does not yet accept or initiate the establishment of
additional MPTCP subflows.
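
For readers following along from userspace: once the full submission is in
place, opting in to MPTCP is a one-line change to socket creation. A
minimal sketch, assuming the IPPROTO_MPTCP protocol number (262 in the
upstream headers) is exposed to userspace:

    /* Hypothetical userspace sketch: request an MPTCP socket instead of
     * plain TCP. connect()/send()/recv() are unchanged; if the peer does
     * not speak MPTCP, the connection falls back to regular TCP.
     */
    #include <netinet/in.h>
    #include <sys/socket.h>

    #ifndef IPPROTO_MPTCP
    #define IPPROTO_MPTCP 262   /* assumed value, from later upstream headers */
    #endif

    int open_mptcp_socket(void)
    {
        return socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
    }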

We also add the necessary code for the RFC8684-style handshake.
RFC8684 obsoletes the experimental RFC6824 and moves MPTCP on to
protocol version 1.

Originally our plan was to submit single-subflow and RFC8684 support as
two patchsets, but to simplify the merging process and ensure that a
coherent MPTCP version lands in Linux, we decided to merge the two sets
into a single one.

The MPTCP patchset exclusively supports RFC 8684. Although all current
MPTCP deployments are based on RFC 6824, future deployments will migrate
to MPTCP version 1. 3GPP's 5G standardization also solely supports
RFC 8684. In addition, we believe that this initial submission of MPTCP
will be cleaner by solely supporting RFC 8684. If support for the old
MPTCP version is required later on, it can always be added.

The major difference between RFC 8684 and RFC 6824 is that RFC 8684 has
better support for servers using TCP SYN cookies, thanks to reliable
retransmission of the MP_CAPABLE option.
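
In short: with RFC 8684 the client sends no key on the SYN and repeats
both keys on the third ACK, so a SYN-cookie server that kept no state can
still recover them. The layout below is an illustration inferred from the
option-writing code later in this series, not a definitive wire-format
reference:

    #include <stdint.h>

    /* Illustrative layout of the RFC 8684 MP_CAPABLE option on the third
     * ACK (20 bytes, TCPOLEN_MPTCP_MPC_ACK in this series). Both keys ride
     * on the ACK, which is what lets a stateless SYN-cookie server
     * reconstruct the connection keys.
     */
    struct mp_capable_ack {
        uint8_t  kind;     /* TCPOPT_MPTCP (30) */
        uint8_t  len;      /* 20 */
        uint8_t  subver;   /* MP_CAPABLE subtype (0) << 4 | version (1) */
        uint8_t  flags;    /* HMAC-SHA256 selected, checksum not required */
        uint64_t sndr_key; /* sender's key, network byte order */
        uint64_t rcvr_key; /* echo of the peer's key */
    } __attribute__((packed));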

Before ending this cover letter with some refs, it is worth mentioning
that we promised David Miller that merging this series will be rewarded
by Twitter dopamine hits :-D

Clone/fetch:
https://github.com/multipath-tcp/mptcp_net-next.git (tag: netdev-v3-part2)

Browse:
https://github.com/multipath-tcp/mptcp_net-next/tree/netdev-v3-part2

Thank you for your review. You can find us at mptcp@lists.01.org and
https://is.gd/mptcp_upstream
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+5118 -2
+2
MAINTAINERS
···
 B:	https://github.com/multipath-tcp/mptcp_net-next/issues
 S:	Maintained
 F:	include/net/mptcp.h
+F:	net/mptcp/
+F:	tools/testing/selftests/net/mptcp/

 NETWORKING [TCP]
 M:	Eric Dumazet <edumazet@google.com>
+35
include/linux/tcp.h
···
 #define TCP_SACK_SEEN	(1 << 0)   /*1 = peer is SACK capable, */
 #define TCP_DSACK_SEEN	(1 << 2)   /*1 = DSACK was received from peer*/

+#if IS_ENABLED(CONFIG_MPTCP)
+struct mptcp_options_received {
+    u64 sndr_key;
+    u64 rcvr_key;
+    u64 data_ack;
+    u64 data_seq;
+    u32 subflow_seq;
+    u16 data_len;
+    u8  mp_capable : 1,
+        mp_join : 1,
+        dss : 1;
+    u8  use_map:1,
+        dsn64:1,
+        data_fin:1,
+        use_ack:1,
+        ack64:1,
+        mpc_map:1,
+        __unused:2;
+};
+#endif
+
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
     int ts_recent_stamp;/* Time we stored ts_recent (for aging) */
···
     u8  num_sacks;   /* Number of SACK blocks */
     u16 user_mss;    /* mss requested by user in ioctl */
     u16 mss_clamp;   /* Maximal mss, negotiated at connection setup */
+#if IS_ENABLED(CONFIG_MPTCP)
+    struct mptcp_options_received mptcp;
+#endif
 };

 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
···
     rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
 #if IS_ENABLED(CONFIG_SMC)
     rx_opt->smc_ok = 0;
+#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+    rx_opt->mptcp.mp_capable = 0;
+    rx_opt->mptcp.mp_join = 0;
+    rx_opt->mptcp.dss = 0;
 #endif
 }

···
     const struct tcp_request_sock_ops *af_specific;
     u64    snt_synack; /* first SYNACK sent time */
     bool   tfo_listener;
+#if IS_ENABLED(CONFIG_MPTCP)
+    bool   is_mptcp;
+#endif
     u32    txhash;
     u32    rcv_isn;
     u32    snt_isn;
···
     u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
                    * while socket was owned by user.
                    */
+#if IS_ENABLED(CONFIG_MPTCP)
+    bool is_mptcp;
+#endif

 #ifdef CONFIG_TCP_MD5SIG
 /* TCP AF-Specific parts; only used by MD5 Signature support so far */
+104 -1
include/net/mptcp.h
···
 #define __NET_MPTCP_H

 #include <linux/skbuff.h>
+#include <linux/tcp.h>
 #include <linux/types.h>

 /* MPTCP sk_buff extension data */
···
         data_fin:1,
         use_ack:1,
         ack64:1,
-        __unused:3;
+        mpc_map:1,
+        __unused:2;
     /* one byte hole */
 };

+struct mptcp_out_options {
+#if IS_ENABLED(CONFIG_MPTCP)
+    u16 suboptions;
+    u64 sndr_key;
+    u64 rcvr_key;
+    struct mptcp_ext ext_copy;
+#endif
+};
+
 #ifdef CONFIG_MPTCP
+
+void mptcp_init(void);
+
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+    return tcp_sk(sk)->is_mptcp;
+}
+
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+    return tcp_rsk(req)->is_mptcp;
+}
+
+void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
+                        int opsize, struct tcp_options_received *opt_rx);
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+                       unsigned int *size, struct mptcp_out_options *opts);
+void mptcp_rcv_synsent(struct sock *sk);
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+                          struct mptcp_out_options *opts);
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+                               unsigned int *size, unsigned int remaining,
+                               struct mptcp_out_options *opts);
+void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
+                            struct tcp_options_received *opt_rx);
+
+void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);

 /* move the skb extension owership, with the assumption that 'to' is
  * newly allocated
···

 #else

+static inline void mptcp_init(void)
+{
+}
+
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+    return false;
+}
+
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+    return false;
+}
+
+static inline void mptcp_parse_option(const struct sk_buff *skb,
+                                      const unsigned char *ptr, int opsize,
+                                      struct tcp_options_received *opt_rx)
+{
+}
+
+static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+                                     unsigned int *size,
+                                     struct mptcp_out_options *opts)
+{
+    return false;
+}
+
+static inline void mptcp_rcv_synsent(struct sock *sk)
+{
+}
+
+static inline bool mptcp_synack_options(const struct request_sock *req,
+                                        unsigned int *size,
+                                        struct mptcp_out_options *opts)
+{
+    return false;
+}
+
+static inline bool mptcp_established_options(struct sock *sk,
+                                             struct sk_buff *skb,
+                                             unsigned int *size,
+                                             unsigned int remaining,
+                                             struct mptcp_out_options *opts)
+{
+    return false;
+}
+
+static inline void mptcp_incoming_options(struct sock *sk,
+                                          struct sk_buff *skb,
+                                          struct tcp_options_received *opt_rx)
+{
+}
+
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
                                       const struct sk_buff *from)
 {
···
 }

 #endif /* CONFIG_MPTCP */
+
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped);
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int mptcpv6_init(void);
+#elif IS_ENABLED(CONFIG_IPV6)
+static inline int mptcpv6_init(void)
+{
+    return 0;
+}
+#endif
+
 #endif /* __NET_MPTCP_H */
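
The header above follows the usual kernel convention of pairing every
CONFIG_MPTCP function with an empty or false-returning inline stub, so TCP
call sites need no #ifdefs. A minimal standalone sketch of that pattern
(names are illustrative, not from the patch):

    struct sock;

    #ifdef CONFIG_FEATURE
    void feature_hook(struct sock *sk);   /* real implementation elsewhere */
    #else
    /* with the feature compiled out, call sites still compile and the
     * empty inline stub disappears entirely at compile time
     */
    static inline void feature_hook(struct sock *sk)
    {
    }
    #endif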
+1
net/Kconfig
···
 source "net/ipv4/Kconfig"
 source "net/ipv6/Kconfig"
 source "net/netlabel/Kconfig"
+source "net/mptcp/Kconfig"

 endif # if INET
+1
net/Makefile
···
 obj-$(CONFIG_QRTR)        += qrtr/
 obj-$(CONFIG_NET_NCSI)    += ncsi/
 obj-$(CONFIG_XDP_SOCKETS) += xdp/
+obj-$(CONFIG_MPTCP)       += mptcp/
+2
net/ipv4/tcp.c
···
 #include <net/icmp.h>
 #include <net/inet_common.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
···
     tcp_metrics_init();
     BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
     tcp_tasklet_init();
+    mptcp_init();
 }
+18 -1
net/ipv4/tcp_input.c
···
 #include <trace/events/tcp.h>
 #include <linux/jump_label_ratelimit.h>
 #include <net/busy_poll.h>
+#include <net/mptcp.h>

 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

···
          */
             break;
 #endif
+        case TCPOPT_MPTCP:
+            mptcp_parse_option(skb, ptr, opsize, opt_rx);
+            break;
+
         case TCPOPT_FASTOPEN:
             tcp_parse_fastopen_option(
                 opsize - TCPOLEN_FASTOPEN_BASE,
···
     struct tcp_sock *tp = tcp_sk(sk);
     bool fragstolen;
     int eaten;
+
+    if (sk_is_mptcp(sk))
+        mptcp_incoming_options(sk, skb, &tp->rx_opt);

     if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
         __kfree_skb(skb);
···
     tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
     tcp_initialize_rcv_mss(sk);

+    if (sk_is_mptcp(sk))
+        mptcp_rcv_synsent(sk);
+
     /* Remember, tcp_poll() does not lock socket!
      * Change state from SYN-SENT only after copied_seq
      * is initialized. */
···
     case TCP_CLOSE_WAIT:
     case TCP_CLOSING:
     case TCP_LAST_ACK:
-        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+            if (sk_is_mptcp(sk))
+                mptcp_incoming_options(sk, skb, &tp->rx_opt);
             break;
+        }
         /* fall through */
     case TCP_FIN_WAIT1:
     case TCP_FIN_WAIT2:
···

     tcp_rsk(req)->af_specific = af_ops;
     tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+    tcp_rsk(req)->is_mptcp = 0;
+#endif

     tcp_clear_options(&tmp_opt);
     tmp_opt.mss_clamp = af_ops->mss_clamp;
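
The hunk above hooks MPTCP into the existing TCP option parser at option
kind 30 (TCPOPT_MPTCP). For illustration, a self-contained sketch of that
dispatch shape, simplified from the kernel's option walkers:

    #include <stdint.h>

    #define TCPOPT_EOL   0
    #define TCPOPT_NOP   1
    #define TCPOPT_MPTCP 30  /* IANA-assigned MPTCP option kind */

    /* Walk a TCP option block and hand kind-30 options to a handler. */
    static void walk_tcp_options(const uint8_t *ptr, int length,
                                 void (*mptcp_cb)(const uint8_t *, int))
    {
        while (length > 0) {
            int opcode = *ptr++, opsize;

            if (opcode == TCPOPT_EOL)
                return;
            if (opcode == TCPOPT_NOP) {  /* one-byte padding option */
                length--;
                continue;
            }
            opsize = *ptr++;
            if (opsize < 2 || opsize > length)
                return;  /* malformed or truncated option */
            if (opcode == TCPOPT_MPTCP)
                mptcp_cb(ptr, opsize);
            ptr += opsize - 2;
            length -= opsize;
        }
    }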
+57
net/ipv4/tcp_output.c
···
 #define pr_fmt(fmt) "TCP: " fmt

 #include <net/tcp.h>
+#include <net/mptcp.h>

 #include <linux/compiler.h>
 #include <linux/gfp.h>
···
 #define OPTION_WSCALE           (1 << 3)
 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
 #define OPTION_SMC              (1 << 9)
+#define OPTION_MPTCP            (1 << 10)

 static void smc_options_write(__be32 *ptr, u16 *options)
 {
···
     __u8 *hash_location;  /* temporary pointer, overloaded */
     __u32 tsval, tsecr;   /* need to include OPTION_TS */
     struct tcp_fastopen_cookie *fastopen_cookie;  /* Fast open cookie */
+    struct mptcp_out_options mptcp;
 };
+
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+    if (unlikely(OPTION_MPTCP & opts->options))
+        mptcp_write_options(ptr, &opts->mptcp);
+#endif
+}

 /* Write previously computed TCP options to the packet.
  *
···
     }

     smc_options_write(ptr, &options);
+
+    mptcp_options_write(ptr, opts);
 }

 static void smc_set_option(const struct tcp_sock *tp,
···
         }
     }
 #endif
+}
+
+static void mptcp_set_option_cond(const struct request_sock *req,
+                                  struct tcp_out_options *opts,
+                                  unsigned int *remaining)
+{
+    if (rsk_is_mptcp(req)) {
+        unsigned int size;
+
+        if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+            if (*remaining >= size) {
+                opts->options |= OPTION_MPTCP;
+                *remaining -= size;
+            }
+        }
+    }
 }

 /* Compute TCP options for SYN packets. This is not the final
···

     smc_set_option(tp, opts, &remaining);

+    if (sk_is_mptcp(sk)) {
+        unsigned int size;
+
+        if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+            opts->options |= OPTION_MPTCP;
+            remaining -= size;
+        }
+    }
+
     return MAX_TCP_OPTION_SPACE - remaining;
 }

···
         }
     }

+    mptcp_set_option_cond(req, opts, &remaining);
+
     smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);

     return MAX_TCP_OPTION_SPACE - remaining;
···
         opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
         opts->tsecr = tp->rx_opt.ts_recent;
         size += TCPOLEN_TSTAMP_ALIGNED;
+    }
+
+    /* MPTCP options have precedence over SACK for the limited TCP
+     * option space because a MPTCP connection would be forced to
+     * fall back to regular TCP if a required multipath option is
+     * missing. SACK still gets a chance to use whatever space is
+     * left.
+     */
+    if (sk_is_mptcp(sk)) {
+        unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+        unsigned int opt_size = 0;
+
+        if (mptcp_established_options(sk, skb, &opt_size, remaining,
+                                      &opts->mptcp)) {
+            opts->options |= OPTION_MPTCP;
+            size += opt_size;
+        }
     }

     eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
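
The precedence comment above is easy to verify with a little arithmetic.
A sketch, using the option sizes this series defines in its protocol.h
(not shown in this excerpt), so treat the constants as assumptions:

    #include <stdio.h>

    #define MAX_TCP_OPTION_SPACE    40
    #define TCPOLEN_TSTAMP_ALIGNED  12
    #define TCPOLEN_MPTCP_DSS_BASE   4  /* kind/len/subtype/flags */
    #define TCPOLEN_MPTCP_DSS_ACK64  8
    #define TCPOLEN_MPTCP_DSS_MAP64 14  /* 8B seq + 4B ssn + 2B len */

    int main(void)
    {
        int dss = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_ACK64 +
                  TCPOLEN_MPTCP_DSS_MAP64;   /* 26 bytes */
        int aligned = (dss + 3) & ~3;        /* 28 after 4-byte alignment */
        int left = MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED - aligned;

        /* prints 0: timestamps + a full DSS exhaust the option space,
         * which is why SACK only gets whatever is left
         */
        printf("bytes left for SACK: %d\n", left);
        return 0;
    }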
+13
net/ipv6/tcp_ipv6.c
···
         sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];

         icsk->icsk_af_ops = &ipv6_mapped;
+        if (sk_is_mptcp(sk))
+            mptcp_handle_ipv6_mapped(sk, true);
         sk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
         tp->af_specific = &tcp_sock_ipv6_mapped_specific;
···
         if (err) {
             icsk->icsk_ext_hdr_len = exthdrlen;
             icsk->icsk_af_ops = &ipv6_specific;
+            if (sk_is_mptcp(sk))
+                mptcp_handle_ipv6_mapped(sk, false);
             sk->sk_backlog_rcv = tcp_v6_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
             tp->af_specific = &tcp_sock_ipv6_specific;
···
         newnp->saddr = newsk->sk_v6_rcv_saddr;

         inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+        if (sk_is_mptcp(newsk))
+            mptcp_handle_ipv6_mapped(newsk, true);
         newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
         newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
···
     ret = register_pernet_subsys(&tcpv6_net_ops);
     if (ret)
         goto out_tcpv6_protosw;
+
+    ret = mptcpv6_init();
+    if (ret)
+        goto out_tcpv6_pernet_subsys;
+
 out:
     return ret;

+out_tcpv6_pernet_subsys:
+    unregister_pernet_subsys(&tcpv6_net_ops);
 out_tcpv6_protosw:
     inet6_unregister_protosw(&tcpv6_protosw);
 out_tcpv6_protocol:
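
These ipv6_mapped hooks matter because an AF_INET6 MPTCP socket can end up
speaking IPv4 through a v4-mapped address, at which point the kernel swaps
icsk_af_ops and must mirror that in the subflow context. A userspace
sketch of the path that triggers it (IPPROTO_MPTCP as assumed earlier):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>

    #ifndef IPPROTO_MPTCP
    #define IPPROTO_MPTCP 262  /* assumed value, as in the earlier sketch */
    #endif

    /* Connecting an AF_INET6 MPTCP socket to ::ffff:192.0.2.1 exercises
     * the ipv6_mapped branch patched above.
     */
    int connect_v4_mapped(void)
    {
        struct sockaddr_in6 a;
        int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_MPTCP);

        if (fd < 0)
            return fd;
        memset(&a, 0, sizeof(a));
        a.sin6_family = AF_INET6;
        a.sin6_port = htons(80);
        inet_pton(AF_INET6, "::ffff:192.0.2.1", &a.sin6_addr);
        return connect(fd, (struct sockaddr *)&a, sizeof(a));
    }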
+26
net/mptcp/Kconfig

config MPTCP
	bool "MPTCP: Multipath TCP"
	depends on INET
	select SKB_EXTENSIONS
	select CRYPTO_LIB_SHA256
	help
	  Multipath TCP (MPTCP) connections send and receive data over multiple
	  subflows in order to utilize multiple network paths. Each subflow
	  uses the TCP protocol, and TCP options carry header information for
	  MPTCP.

config MPTCP_IPV6
	bool "MPTCP: IPv6 support for Multipath TCP"
	depends on MPTCP
	select IPV6
	default y

config MPTCP_HMAC_TEST
	bool "Tests for MPTCP HMAC implementation"
	default n
	help
	  This option enable boot time self-test for the HMAC implementation
	  used by the MPTCP code

	  Say N if you are unsure.
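
In practice this means the feature is compiled in with CONFIG_MPTCP=y
(CONFIG_MPTCP_IPV6 then defaults to y), while CONFIG_MPTCP_HMAC_TEST is a
boot-time self-test knob meant for development builds only.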
+4
net/mptcp/Makefile
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_MPTCP) += mptcp.o

mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
+152
net/mptcp/crypto.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP cryptographic functions
 * Copyright (c) 2017 - 2019, Intel Corporation.
 *
 * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and
 * mptcp_ipv6 from multipath-tcp.org, authored by:
 *
 *     Sébastien Barré <sebastien.barre@uclouvain.be>
 *     Christoph Paasch <christoph.paasch@uclouvain.be>
 *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
 *     Gregory Detal <gregory.detal@uclouvain.be>
 *     Fabien Duchêne <fabien.duchene@uclouvain.be>
 *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
 *     Lavkesh Lahngir <lavkesh51@gmail.com>
 *     Andreas Ripke <ripke@neclab.eu>
 *     Vlad Dogaru <vlad.dogaru@intel.com>
 *     Octavian Purdila <octavian.purdila@intel.com>
 *     John Ronan <jronan@tssg.org>
 *     Catalin Nicutar <catalin.nicutar@gmail.com>
 *     Brandon Heller <brandonh@stanford.edu>
 */

#include <linux/kernel.h>
#include <crypto/sha.h>
#include <asm/unaligned.h>

#include "protocol.h"

#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)

void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
{
    __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
    __be64 input = cpu_to_be64(key);
    struct sha256_state state;

    sha256_init(&state);
    sha256_update(&state, (__force u8 *)&input, sizeof(input));
    sha256_final(&state, (u8 *)mptcp_hashed_key);

    if (token)
        *token = be32_to_cpu(mptcp_hashed_key[0]);
    if (idsn)
        *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
}

void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
                           void *hmac)
{
    u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
    __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
    __be32 *hash_out = (__force __be32 *)hmac;
    struct sha256_state state;
    u8 key1be[8];
    u8 key2be[8];
    int i;

    put_unaligned_be64(key1, key1be);
    put_unaligned_be64(key2, key2be);

    /* Generate key xored with ipad */
    memset(input, 0x36, SHA_MESSAGE_BYTES);
    for (i = 0; i < 8; i++)
        input[i] ^= key1be[i];
    for (i = 0; i < 8; i++)
        input[i + 8] ^= key2be[i];

    put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
    put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);

    sha256_init(&state);
    sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);

    /* emit sha256(K1 || msg) on the second input block, so we can
     * reuse 'input' for the last hashing
     */
    sha256_final(&state, &input[SHA256_BLOCK_SIZE]);

    /* Prepare second part of hmac */
    memset(input, 0x5C, SHA_MESSAGE_BYTES);
    for (i = 0; i < 8; i++)
        input[i] ^= key1be[i];
    for (i = 0; i < 8; i++)
        input[i + 8] ^= key2be[i];

    sha256_init(&state);
    sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE);
    sha256_final(&state, (u8 *)mptcp_hashed_key);

    /* takes only first 160 bits */
    for (i = 0; i < 5; i++)
        hash_out[i] = mptcp_hashed_key[i];
}

#ifdef CONFIG_MPTCP_HMAC_TEST
struct test_cast {
    char *key;
    char *msg;
    char *result;
};

/* we can't reuse RFC 4231 test vectors, as we have constraint on the
 * input and key size, and we truncate the output.
 */
static struct test_cast tests[] = {
    {
        .key = "0b0b0b0b0b0b0b0b",
        .msg = "48692054",
        .result = "8385e24fb4235ac37556b6b886db106284a1da67",
    },
    {
        .key = "aaaaaaaaaaaaaaaa",
        .msg = "dddddddd",
        .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492",
    },
    {
        .key = "0102030405060708",
        .msg = "cdcdcdcd",
        .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6",
    },
};

static int __init test_mptcp_crypto(void)
{
    char hmac[20], hmac_hex[41];
    u32 nonce1, nonce2;
    u64 key1, key2;
    int i, j;

    for (i = 0; i < ARRAY_SIZE(tests); ++i) {
        /* mptcp hmap will convert to be before computing the hmac */
        key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
        key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
        nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
        nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));

        mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
        for (j = 0; j < 20; ++j)
            sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
        hmac_hex[40] = 0;

        if (memcmp(hmac_hex, tests[i].result, 40))
            pr_err("test %d failed, got %s expected %s", i,
                   hmac_hex, tests[i].result);
        else
            pr_info("test %d [ ok ]", i);
    }
    return 0;
}

late_initcall(test_mptcp_crypto);
#endif
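
The construction above is plain HMAC-SHA256 with a 16-byte key (the two
64-bit keys concatenated) and an 8-byte message (the two 32-bit nonces),
truncated to the first 160 bits. A userspace cross-check sketch against
the first built-in test vector, using OpenSSL's one-shot HMAC() and
treating the key/msg strings as raw ASCII bytes exactly as the kernel
test does:

    /* build with: cc hmac_check.c -lcrypto */
    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>
    #include <openssl/hmac.h>

    int main(void)
    {
        const unsigned char key[] = "0b0b0b0b0b0b0b0b"; /* 16 raw bytes */
        const unsigned char msg[] = "48692054";         /* 8 raw bytes */
        const char *expect = "8385e24fb4235ac37556b6b886db106284a1da67";
        unsigned char md[32];
        unsigned int mdlen;
        char hex[41];
        int i;

        HMAC(EVP_sha256(), key, 16, msg, 8, md, &mdlen);
        for (i = 0; i < 20; i++)  /* keep only the first 160 bits */
            sprintf(&hex[i * 2], "%02x", md[i]);
        hex[40] = '\0';

        printf("%s: %s\n", strcmp(hex, expect) ? "FAIL" : "ok", hex);
        return 0;
    }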
+130
net/mptcp/ctrl.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Tessares SA.
 */

#include <linux/sysctl.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "protocol.h"

#define MPTCP_SYSCTL_PATH "net/mptcp"

static int mptcp_pernet_id;
struct mptcp_pernet {
    struct ctl_table_header *ctl_table_hdr;

    int mptcp_enabled;
};

static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
{
    return net_generic(net, mptcp_pernet_id);
}

int mptcp_is_enabled(struct net *net)
{
    return mptcp_get_pernet(net)->mptcp_enabled;
}

static struct ctl_table mptcp_sysctl_table[] = {
    {
        .procname = "enabled",
        .maxlen = sizeof(int),
        .mode = 0644,
        /* users with CAP_NET_ADMIN or root (not and) can change this
         * value, same as other sysctl or the 'net' tree.
         */
        .proc_handler = proc_dointvec,
    },
    {}
};

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
    pernet->mptcp_enabled = 1;
}

static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
    struct ctl_table_header *hdr;
    struct ctl_table *table;

    table = mptcp_sysctl_table;
    if (!net_eq(net, &init_net)) {
        table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
        if (!table)
            goto err_alloc;
    }

    table[0].data = &pernet->mptcp_enabled;

    hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
    if (!hdr)
        goto err_reg;

    pernet->ctl_table_hdr = hdr;

    return 0;

err_reg:
    if (!net_eq(net, &init_net))
        kfree(table);
err_alloc:
    return -ENOMEM;
}

static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
{
    struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;

    unregister_net_sysctl_table(pernet->ctl_table_hdr);

    kfree(table);
}

static int __net_init mptcp_net_init(struct net *net)
{
    struct mptcp_pernet *pernet = mptcp_get_pernet(net);

    mptcp_pernet_set_defaults(pernet);

    return mptcp_pernet_new_table(net, pernet);
}

/* Note: the callback will only be called per extra netns */
static void __net_exit mptcp_net_exit(struct net *net)
{
    struct mptcp_pernet *pernet = mptcp_get_pernet(net);

    mptcp_pernet_del_table(pernet);
}

static struct pernet_operations mptcp_pernet_ops = {
    .init = mptcp_net_init,
    .exit = mptcp_net_exit,
    .id = &mptcp_pernet_id,
    .size = sizeof(struct mptcp_pernet),
};

void __init mptcp_init(void)
{
    mptcp_proto_init();

    if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
        panic("Failed to register MPTCP pernet subsystem.\n");
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcpv6_init(void)
{
    int err;

    err = mptcp_proto_v6_init();

    return err;
}
#endif
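
ctrl.c gives each network namespace a net.mptcp.enabled toggle, on by
default. A small sketch of how a test or application could check it via
procfs (the path follows from MPTCP_SYSCTL_PATH and the "enabled"
procname above):

    #include <stdio.h>

    /* Returns 1/0 from /proc/sys/net/mptcp/enabled, or -1 if the sysctl
     * is absent (MPTCP not built in).
     */
    int mptcp_enabled(void)
    {
        FILE *f = fopen("/proc/sys/net/mptcp/enabled", "r");
        int val;

        if (!f)
            return -1;
        if (fscanf(f, "%d", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }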
+586
net/mptcp/options.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"

static bool mptcp_cap_flag_sha256(u8 flags)
{
    return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
}

void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
                        int opsize, struct tcp_options_received *opt_rx)
{
    struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
    u8 subtype = *ptr >> 4;
    int expected_opsize;
    u8 version;
    u8 flags;

    switch (subtype) {
    case MPTCPOPT_MP_CAPABLE:
        /* strict size checking */
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
            if (skb->len > tcp_hdr(skb)->doff << 2)
                expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
            else
                expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
        } else {
            if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
                expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
            else
                expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
        }
        if (opsize != expected_opsize)
            break;

        /* try to be gentle vs future versions on the initial syn */
        version = *ptr++ & MPTCP_VERSION_MASK;
        if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
            if (version != MPTCP_SUPPORTED_VERSION)
                break;
        } else if (version < MPTCP_SUPPORTED_VERSION) {
            break;
        }

        flags = *ptr++;
        if (!mptcp_cap_flag_sha256(flags) ||
            (flags & MPTCP_CAP_EXTENSIBILITY))
            break;

        /* RFC 6824, Section 3.1:
         * "For the Checksum Required bit (labeled "A"), if either
         * host requires the use of checksums, checksums MUST be used.
         * In other words, the only way for checksums not to be used
         * is if both hosts in their SYNs set A=0."
         *
         * Section 3.3.0:
         * "If a checksum is not present when its use has been
         * negotiated, the receiver MUST close the subflow with a RST as
         * it is considered broken."
         *
         * We don't implement DSS checksum - fall back to TCP.
         */
        if (flags & MPTCP_CAP_CHECKSUM_REQD)
            break;

        mp_opt->mp_capable = 1;
        if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
            mp_opt->sndr_key = get_unaligned_be64(ptr);
            ptr += 8;
        }
        if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
            mp_opt->rcvr_key = get_unaligned_be64(ptr);
            ptr += 8;
        }
        if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
            /* Section 3.1.:
             * "the data parameters in a MP_CAPABLE are semantically
             * equivalent to those in a DSS option and can be used
             * interchangeably."
             */
            mp_opt->dss = 1;
            mp_opt->use_map = 1;
            mp_opt->mpc_map = 1;
            mp_opt->data_len = get_unaligned_be16(ptr);
            ptr += 2;
        }
        pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
                 version, flags, opsize, mp_opt->sndr_key,
                 mp_opt->rcvr_key, mp_opt->data_len);
        break;

    case MPTCPOPT_DSS:
        pr_debug("DSS");
        ptr++;

        /* we must clear 'mpc_map' be able to detect MP_CAPABLE
         * map vs DSS map in mptcp_incoming_options(), and reconstruct
         * map info accordingly
         */
        mp_opt->mpc_map = 0;
        flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
        mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
        mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
        mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
        mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
        mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);

        pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
                 mp_opt->data_fin, mp_opt->dsn64,
                 mp_opt->use_map, mp_opt->ack64,
                 mp_opt->use_ack);

        expected_opsize = TCPOLEN_MPTCP_DSS_BASE;

        if (mp_opt->use_ack) {
            if (mp_opt->ack64)
                expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
            else
                expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
        }

        if (mp_opt->use_map) {
            if (mp_opt->dsn64)
                expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
            else
                expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
        }

        /* RFC 6824, Section 3.3:
         * If a checksum is present, but its use had
         * not been negotiated in the MP_CAPABLE handshake,
         * the checksum field MUST be ignored.
         */
        if (opsize != expected_opsize &&
            opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
            break;

        mp_opt->dss = 1;

        if (mp_opt->use_ack) {
            if (mp_opt->ack64) {
                mp_opt->data_ack = get_unaligned_be64(ptr);
                ptr += 8;
            } else {
                mp_opt->data_ack = get_unaligned_be32(ptr);
                ptr += 4;
            }

            pr_debug("data_ack=%llu", mp_opt->data_ack);
        }

        if (mp_opt->use_map) {
            if (mp_opt->dsn64) {
                mp_opt->data_seq = get_unaligned_be64(ptr);
                ptr += 8;
            } else {
                mp_opt->data_seq = get_unaligned_be32(ptr);
                ptr += 4;
            }

            mp_opt->subflow_seq = get_unaligned_be32(ptr);
            ptr += 4;

            mp_opt->data_len = get_unaligned_be16(ptr);
            ptr += 2;

            pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
                     mp_opt->data_seq, mp_opt->subflow_seq,
                     mp_opt->data_len);
        }

        break;

    default:
        break;
    }
}

void mptcp_get_options(const struct sk_buff *skb,
                       struct tcp_options_received *opt_rx)
{
    const unsigned char *ptr;
    const struct tcphdr *th = tcp_hdr(skb);
    int length = (th->doff * 4) - sizeof(struct tcphdr);

    ptr = (const unsigned char *)(th + 1);

    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return;
        case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return;
            if (opsize > length)
                return;     /* don't parse partial options */
            if (opcode == TCPOPT_MPTCP)
                mptcp_parse_option(skb, ptr, opsize, opt_rx);
            ptr += opsize - 2;
            length -= opsize;
        }
    }
}

bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
                       unsigned int *size, struct mptcp_out_options *opts)
{
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

    /* we will use snd_isn to detect first pkt [re]transmission
     * in mptcp_established_options_mp()
     */
    subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
    if (subflow->request_mptcp) {
        pr_debug("local_key=%llu", subflow->local_key);
        opts->suboptions = OPTION_MPTCP_MPC_SYN;
        opts->sndr_key = subflow->local_key;
        *size = TCPOLEN_MPTCP_MPC_SYN;
        return true;
    }
    return false;
}

void mptcp_rcv_synsent(struct sock *sk)
{
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    pr_debug("subflow=%p", subflow);
    if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
        subflow->mp_capable = 1;
        subflow->can_ack = 1;
        subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
    } else {
        tcp_sk(sk)->is_mptcp = 0;
    }
}

static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                                         unsigned int *size,
                                         unsigned int remaining,
                                         struct mptcp_out_options *opts)
{
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    struct mptcp_ext *mpext;
    unsigned int data_len;

    pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
             subflow->fourth_ack, subflow->snd_isn,
             skb ? TCP_SKB_CB(skb)->seq : 0, remaining);

    if (subflow->mp_capable && !subflow->fourth_ack && skb &&
        subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
        /* When skb is not available, we better over-estimate the
         * emitted options len. A full DSS option is longer than
         * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
         * that.
         */
        mpext = mptcp_get_ext(skb);
        data_len = mpext ? mpext->data_len : 0;

        /* we will check ext_copy.data_len in mptcp_write_options() to
         * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
         * TCPOLEN_MPTCP_MPC_ACK
         */
        opts->ext_copy.data_len = data_len;
        opts->suboptions = OPTION_MPTCP_MPC_ACK;
        opts->sndr_key = subflow->local_key;
        opts->rcvr_key = subflow->remote_key;

        /* Section 3.1.
         * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
         * packets that start the first subflow of an MPTCP connection,
         * as well as the first packet that carries data
         */
        if (data_len > 0)
            *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
        else
            *size = TCPOLEN_MPTCP_MPC_ACK;

        pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
                 subflow, subflow->local_key, subflow->remote_key,
                 data_len);

        return true;
    }
    return false;
}

static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
                                 struct mptcp_ext *ext)
{
    ext->data_fin = 1;

    if (!ext->use_map) {
        /* RFC6824 requires a DSS mapping with specific values
         * if DATA_FIN is set but no data payload is mapped
         */
        ext->use_map = 1;
        ext->dsn64 = 1;
        ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
        ext->subflow_seq = 0;
        ext->data_len = 1;
    } else {
        /* If there's an existing DSS mapping, DATA_FIN consumes
         * 1 additional byte of mapping space.
         */
        ext->data_len++;
    }
}

static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
                                          unsigned int *size,
                                          unsigned int remaining,
                                          struct mptcp_out_options *opts)
{
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    unsigned int dss_size = 0;
    struct mptcp_ext *mpext;
    struct mptcp_sock *msk;
    unsigned int ack_size;
    bool ret = false;
    u8 tcp_fin;

    if (skb) {
        mpext = mptcp_get_ext(skb);
        tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
    } else {
        mpext = NULL;
        tcp_fin = 0;
    }

    if (!skb || (mpext && mpext->use_map) || tcp_fin) {
        unsigned int map_size;

        map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;

        remaining -= map_size;
        dss_size = map_size;
        if (mpext)
            opts->ext_copy = *mpext;

        if (skb && tcp_fin &&
            subflow->conn->sk_state != TCP_ESTABLISHED)
            mptcp_write_data_fin(subflow, &opts->ext_copy);
        ret = true;
    }

    opts->ext_copy.use_ack = 0;
    msk = mptcp_sk(subflow->conn);
    if (!msk || !READ_ONCE(msk->can_ack)) {
        *size = ALIGN(dss_size, 4);
        return ret;
    }

    ack_size = TCPOLEN_MPTCP_DSS_ACK64;

    /* Add kind/length/subtype/flag overhead if mapping is not populated */
    if (dss_size == 0)
        ack_size += TCPOLEN_MPTCP_DSS_BASE;

    dss_size += ack_size;

    opts->ext_copy.data_ack = msk->ack_seq;
    opts->ext_copy.ack64 = 1;
    opts->ext_copy.use_ack = 1;

    *size = ALIGN(dss_size, 4);
    return true;
}

bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
                               unsigned int *size, unsigned int remaining,
                               struct mptcp_out_options *opts)
{
    unsigned int opt_size = 0;
    bool ret = false;

    if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
        ret = true;
    else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
                                           opts))
        ret = true;

    /* we reserved enough space for the above options, and exceeding the
     * TCP option space would be fatal
     */
    if (WARN_ON_ONCE(opt_size > remaining))
        return false;

    *size += opt_size;
    remaining -= opt_size;

    return ret;
}

bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
                          struct mptcp_out_options *opts)
{
    struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

    if (subflow_req->mp_capable) {
        opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
        opts->sndr_key = subflow_req->local_key;
        *size = TCPOLEN_MPTCP_MPC_SYNACK;
        pr_debug("subflow_req=%p, local_key=%llu",
                 subflow_req, subflow_req->local_key);
        return true;
    }
    return false;
}

static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
                             struct sk_buff *skb,
                             struct mptcp_options_received *mp_opt)
{
    /* here we can process OoO, in-window pkts, only in-sequence 4th ack
     * are relevant
     */
    if (likely(subflow->fourth_ack ||
               TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
        return true;

    if (mp_opt->use_ack)
        subflow->fourth_ack = 1;

    if (subflow->can_ack)
        return true;

    /* If the first established packet does not contain MP_CAPABLE + data
     * then fallback to TCP
     */
    if (!mp_opt->mp_capable) {
        subflow->mp_capable = 0;
        tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
        return false;
    }
    subflow->remote_key = mp_opt->sndr_key;
    subflow->can_ack = 1;
    return true;
}

void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
                            struct tcp_options_received *opt_rx)
{
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    struct mptcp_options_received *mp_opt;
    struct mptcp_ext *mpext;

    mp_opt = &opt_rx->mptcp;
    if (!check_fourth_ack(subflow, skb, mp_opt))
        return;

    if (!mp_opt->dss)
        return;

    mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
    if (!mpext)
        return;

    memset(mpext, 0, sizeof(*mpext));

    if (mp_opt->use_map) {
        if (mp_opt->mpc_map) {
            /* this is an MP_CAPABLE carrying MPTCP data
             * we know this map the first chunk of data
             */
            mptcp_crypto_key_sha(subflow->remote_key, NULL,
                                 &mpext->data_seq);
            mpext->data_seq++;
            mpext->subflow_seq = 1;
            mpext->dsn64 = 1;
            mpext->mpc_map = 1;
        } else {
            mpext->data_seq = mp_opt->data_seq;
            mpext->subflow_seq = mp_opt->subflow_seq;
            mpext->dsn64 = mp_opt->dsn64;
        }
        mpext->data_len = mp_opt->data_len;
        mpext->use_map = 1;
    }

    if (mp_opt->use_ack) {
        mpext->data_ack = mp_opt->data_ack;
        mpext->use_ack = 1;
        mpext->ack64 = mp_opt->ack64;
    }

    mpext->data_fin = mp_opt->data_fin;
}

void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
{
    if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
         OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
        u8 len;

        if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
            len = TCPOLEN_MPTCP_MPC_SYN;
        else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
            len = TCPOLEN_MPTCP_MPC_SYNACK;
        else if (opts->ext_copy.data_len)
            len = TCPOLEN_MPTCP_MPC_ACK_DATA;
        else
            len = TCPOLEN_MPTCP_MPC_ACK;

        *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
                       (MPTCPOPT_MP_CAPABLE << 12) |
                       (MPTCP_SUPPORTED_VERSION << 8) |
                       MPTCP_CAP_HMAC_SHA256);

        if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
              opts->suboptions))
            goto mp_capable_done;

        put_unaligned_be64(opts->sndr_key, ptr);
        ptr += 2;
        if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
            goto mp_capable_done;

        put_unaligned_be64(opts->rcvr_key, ptr);
        ptr += 2;
        if (!opts->ext_copy.data_len)
            goto mp_capable_done;

        put_unaligned_be32(opts->ext_copy.data_len << 16 |
                           TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
        ptr += 1;
    }

mp_capable_done:
    if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
        struct mptcp_ext *mpext = &opts->ext_copy;
        u8 len = TCPOLEN_MPTCP_DSS_BASE;
        u8 flags = 0;

        if (mpext->use_ack) {
            len += TCPOLEN_MPTCP_DSS_ACK64;
            flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
        }

        if (mpext->use_map) {
            len += TCPOLEN_MPTCP_DSS_MAP64;

            /* Use only 64-bit mapping flags for now, add
             * support for optional 32-bit mappings later.
             */
            flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
            if (mpext->data_fin)
                flags |= MPTCP_DSS_DATA_FIN;
        }

        *ptr++ = htonl((TCPOPT_MPTCP << 24) |
                       (len << 16) |
                       (MPTCPOPT_DSS << 12) |
                       (flags));

        if (mpext->use_ack) {
            put_unaligned_be64(mpext->data_ack, ptr);
            ptr += 2;
        }

        if (mpext->use_map) {
            put_unaligned_be64(mpext->data_seq, ptr);
            ptr += 2;
            put_unaligned_be32(mpext->subflow_seq, ptr);
            ptr += 1;
            put_unaligned_be32(mpext->data_len << 16 |
                               TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
        }
    }
}
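
The strict size checks in mptcp_parse_option() above map directly onto the
v1 handshake: no key on the SYN, one key on the SYN/ACK, both keys on the
ACK, plus a data-length field when the third ACK already carries data.
Summarized as a sketch (values inferred from the parse logic; the series
defines them in protocol.h, which is not shown here):

    /* MP_CAPABLE option sizes accepted by the strict checking above */
    enum {
        TCPOLEN_MPTCP_MPC_SYN      =  4, /* kind/len/subver/flags        */
        TCPOLEN_MPTCP_MPC_SYNACK   = 12, /*  + 8-byte sender key         */
        TCPOLEN_MPTCP_MPC_ACK      = 20, /*  + 8-byte receiver key       */
        TCPOLEN_MPTCP_MPC_ACK_DATA = 22, /*  + 2-byte DSS-style data_len */
    };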
+1244
net/mptcp/protocol.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

static void __mptcp_close(struct sock *sk, long timeout);

static const struct proto_ops *tcp_proto_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
    if (sk->sk_family == AF_INET6)
        return &inet6_stream_ops;
#endif
    return &inet_stream_ops;
}

/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing
 * socket->sk and stream ops and destroying msk
 * return the msk socket, as we can't access msk anymore after this function
 * completes
 * Called with msk lock held, releases such lock before returning
 */
static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk,
                                              struct sock *ssk)
{
    struct mptcp_subflow_context *subflow;
    struct socket *sock;
    struct sock *sk;

    sk = (struct sock *)msk;
    sock = sk->sk_socket;
    subflow = mptcp_subflow_ctx(ssk);

    /* detach the msk socket */
    list_del_init(&subflow->node);
    sock_orphan(sk);
    sock->sk = NULL;

    /* socket is now TCP */
    lock_sock(ssk);
    sock_graft(ssk, sock);
    if (subflow->conn) {
        /* We can't release the ULP data on a live socket,
         * restore the tcp callback
         */
        mptcp_subflow_tcp_fallback(ssk, subflow);
        sock_put(subflow->conn);
        subflow->conn = NULL;
    }
    release_sock(ssk);
    sock->ops = tcp_proto_ops(ssk);

    /* destroy the left-over msk sock */
    __mptcp_close(sk, 0);
    return sock;
}

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
    if (!msk->subflow || READ_ONCE(msk->can_ack))
        return NULL;

    return msk->subflow;
}

static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
{
    return msk->first && !sk_is_mptcp(msk->first);
}

/* if the mp_capable handshake has failed, it fallbacks msk to plain TCP,
 * releases the socket lock and returns a reference to the now TCP socket.
 * Otherwise returns NULL
 */
static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
    sock_owned_by_me((const struct sock *)msk);

    if (likely(!__mptcp_needs_tcp_fallback(msk)))
        return NULL;

    if (msk->subflow) {
        /* the first subflow is an active connection, discart the
         * paired socket
         */
        msk->subflow->sk = NULL;
        sock_release(msk->subflow);
        msk->subflow = NULL;
    }

    return __mptcp_fallback_to_tcp(msk, msk->first);
}

static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
{
    return !msk->first;
}

static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
{
    struct mptcp_subflow_context *subflow;
    struct sock *sk = (struct sock *)msk;
    struct socket *ssock;
    int err;

    ssock = __mptcp_nmpc_socket(msk);
    if (ssock)
        goto set_state;

    if (!__mptcp_can_create_subflow(msk))
        return ERR_PTR(-EINVAL);

    err = mptcp_subflow_create_socket(sk, &ssock);
    if (err)
        return ERR_PTR(err);

    msk->first = ssock->sk;
    msk->subflow = ssock;
    subflow = mptcp_subflow_ctx(ssock->sk);
    list_add(&subflow->node, &msk->conn_list);
    subflow->request_mptcp = 1;

set_state:
    if (state != MPTCP_SAME_STATE)
        inet_sk_state_store(sk, state);
    return ssock;
}

static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
{
    struct mptcp_subflow_context *subflow;

    sock_owned_by_me((const struct sock *)msk);

    mptcp_for_each_subflow(msk, subflow) {
        return mptcp_subflow_tcp_sock(subflow);
    }

    return NULL;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
    if (!msk->cached_ext)
        msk->cached_ext = __skb_ext_alloc();

    return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
    struct mptcp_subflow_context *subflow;
    struct sock *sk = (struct sock *)msk;

    sock_owned_by_me(sk);

    mptcp_for_each_subflow(msk, subflow) {
        if (subflow->data_avail)
            return mptcp_subflow_tcp_sock(subflow);
    }

    return NULL;
}

static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
                                             const struct sk_buff *skb,
                                             const struct mptcp_ext *mpext)
{
    if (!tcp_skb_can_collapse_to(skb))
        return false;

    /* can collapse only if MPTCP level sequence is in order */
    return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
}

static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                              struct msghdr *msg, long *timeo, int *pmss_now,
                              int *ps_goal)
{
    int mss_now, avail_size, size_goal, ret;
    struct mptcp_sock *msk = mptcp_sk(sk);
    struct mptcp_ext *mpext = NULL;
    struct sk_buff *skb, *tail;
    bool can_collapse = false;
    struct page_frag *pfrag;
    size_t psize;

    /* use the mptcp page cache so that we can easily move the data
     * from one substream to another, but do per subflow memory accounting
     */
    pfrag = sk_page_frag(sk);
    while (!sk_page_frag_refill(ssk, pfrag) ||
           !mptcp_ext_cache_refill(msk)) {
        ret = sk_stream_wait_memory(ssk, timeo);
        if (ret)
            return ret;
        if (unlikely(__mptcp_needs_tcp_fallback(msk)))
            return 0;
    }

    /* compute copy limit */
    mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
    *pmss_now = mss_now;
    *ps_goal = size_goal;
    avail_size = size_goal;
    skb = tcp_write_queue_tail(ssk);
    if (skb) {
        mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

        /* Limit the write to the size available in the
         * current skb, if any, so that we create at most a new skb.
         * Explicitly tells TCP internals to avoid collapsing on later
         * queue management operation, to avoid breaking the ext <->
         * SSN association set here
         */
        can_collapse = (size_goal - skb->len > 0) &&
                       mptcp_skb_can_collapse_to(msk, skb, mpext);
        if (!can_collapse)
            TCP_SKB_CB(skb)->eor = 1;
        else
            avail_size = size_goal - skb->len;
    }
    psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);

    /* Copy to page */
    pr_debug("left=%zu", msg_data_left(msg));
    psize = copy_page_from_iter(pfrag->page, pfrag->offset,
                                min_t(size_t, msg_data_left(msg), psize),
                                &msg->msg_iter);
    pr_debug("left=%zu", msg_data_left(msg));
    if (!psize)
        return -EINVAL;

    /* tell the TCP stack to delay the push so that we can safely
     * access the skb after the sendpages call
     */
    ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
                           msg->msg_flags | MSG_SENDPAGE_NOTLAST);
    if (ret <= 0)
        return ret;
    if (unlikely(ret < psize))
        iov_iter_revert(&msg->msg_iter, psize - ret);

    /* if the tail skb extension is still the cached one, collapsing
     * really happened. Note: we can't check for 'same skb' as the sk_buff
     * hdr on tail can be transmitted, freed and re-allocated by the
     * do_tcp_sendpages() call
     */
    tail = tcp_write_queue_tail(ssk);
    if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
        WARN_ON_ONCE(!can_collapse);
        mpext->data_len += ret;
        goto out;
    }

    skb = tcp_write_queue_tail(ssk);
    mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
    msk->cached_ext = NULL;

    memset(mpext, 0, sizeof(*mpext));
    mpext->data_seq = msk->write_seq;
    mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
    mpext->data_len = ret;
    mpext->use_map = 1;
    mpext->dsn64 = 1;

    pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
             mpext->data_seq, mpext->subflow_seq, mpext->data_len,
             mpext->dsn64);

out:
    pfrag->offset += ret;
    msk->write_seq += ret;
    mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

    return ret;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
    struct socket *sock;

    if (likely(sk_stream_is_writeable(ssk)))
        return;

    sock = READ_ONCE(ssk->sk_socket);

    if (sock) {
        clear_bit(MPTCP_SEND_SPACE, &msk->flags);
        smp_mb__after_atomic();
        /* set NOSPACE only after clearing SEND_SPACE flag */
        set_bit(SOCK_NOSPACE, &sock->flags);
    }
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
    int mss_now = 0, size_goal = 0, ret = 0;
    struct mptcp_sock *msk = mptcp_sk(sk);
    struct socket *ssock;
    size_t copied = 0;
    struct sock *ssk;
    long timeo;

    if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
        return -EOPNOTSUPP;

    lock_sock(sk);
    ssock = __mptcp_tcp_fallback(msk);
    if (unlikely(ssock)) {
fallback:
        pr_debug("fallback passthrough");
        ret = sock_sendmsg(ssock, msg);
        return ret >= 0 ? ret + copied : (copied ? copied : ret);
    }

    timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

    ssk = mptcp_subflow_get(msk);
    if (!ssk) {
        release_sock(sk);
        return -ENOTCONN;
    }

    pr_debug("conn_list->subflow=%p", ssk);

    lock_sock(ssk);
    while (msg_data_left(msg)) {
        ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
                                 &size_goal);
        if (ret < 0)
            break;
        if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
            release_sock(ssk);
            ssock = __mptcp_tcp_fallback(msk);
            goto fallback;
        }

        copied += ret;
    }

    if (copied) {
        ret = copied;
        tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
                 size_goal);
    }

    ssk_check_wmem(msk, ssk);
    release_sock(ssk);
    release_sock(sk);
    return ret;
}

int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
                     unsigned int offset, size_t len)
{
    struct mptcp_read_arg *arg = desc->arg.data;
    size_t copy_len;

    copy_len = min(desc->count, len);

    if (likely(arg->msg)) {
        int err;

        err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len);
        if (err) {
            pr_debug("error path");
            desc->error = err;
            return err;
        }
    } else {
        pr_debug("Flushing skb payload");
    }

    desc->count -= copy_len;

    pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count);
    return copy_len;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
    DEFINE_WAIT_FUNC(wait, woken_wake_function);
    struct mptcp_sock *msk = mptcp_sk(sk);

    add_wait_queue(sk_sleep(sk), &wait);
    sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

    sk_wait_event(sk, timeo,
                  test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

    sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
    remove_wait_queue(sk_sleep(sk), &wait);
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         int nonblock, int flags, int *addr_len)
{
    struct mptcp_sock *msk = mptcp_sk(sk);
    struct mptcp_subflow_context *subflow;
    bool more_data_avail = false;
    struct mptcp_read_arg arg;
    read_descriptor_t desc;
    bool wait_data = false;
    struct socket *ssock;
    struct tcp_sock *tp;
    bool done = false;
    struct sock *ssk;
    int copied = 0;
    int target;
    long timeo;

    if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
        return -EOPNOTSUPP;

    lock_sock(sk);
    ssock = __mptcp_tcp_fallback(msk);
    if (unlikely(ssock)) {
fallback:
        pr_debug("fallback-read subflow=%p",
                 mptcp_subflow_ctx(ssock->sk));
        copied = sock_recvmsg(ssock, msg, flags);
        return copied;
    }

    arg.msg = msg;
    desc.arg.data = &arg;
    desc.error = 0;

    timeo = sock_rcvtimeo(sk, nonblock);

    len = min_t(size_t, len, INT_MAX);
    target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

    while (!done) {
        u32 map_remaining;
        int bytes_read;

        ssk = mptcp_subflow_recv_lookup(msk);
        pr_debug("msk=%p ssk=%p", msk, ssk);
        if (!ssk)
            goto wait_for_data;

        subflow = mptcp_subflow_ctx(ssk);
        tp = tcp_sk(ssk);

        lock_sock(ssk);
        do {
            /* try to read as much data as available */
            map_remaining = subflow->map_data_len -
                            mptcp_subflow_get_map_offset(subflow);
            desc.count = min_t(size_t, len - copied, map_remaining);
            pr_debug("reading %zu bytes, copied %d", desc.count,
                     copied);
            bytes_read = tcp_read_sock(ssk, &desc,
                                       mptcp_read_actor);
            if (bytes_read < 0) {
                if (!copied)
                    copied = bytes_read;
                done = true;
                goto next;
            }

            pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
                     msk->ack_seq + bytes_read);
            msk->ack_seq += bytes_read;
            copied += bytes_read;
            if (copied >= len) {
                done = true;
                goto next;
            }
            if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
                pr_err("Urgent data present, cannot proceed");
                done = true;
                goto next;
            }
next:
            more_data_avail = mptcp_subflow_data_available(ssk);
        } while (more_data_avail && !done);
        release_sock(ssk);
        continue;

wait_for_data:
        more_data_avail = false;

        /* only the master socket status is relevant here. The exit
         * conditions mirror closely tcp_recvmsg()
         */
        if (copied >= target)
            break;

        if (copied) {
            if (sk->sk_err ||
                sk->sk_state == TCP_CLOSE ||
                (sk->sk_shutdown & RCV_SHUTDOWN) ||
                !timeo ||
                signal_pending(current))
                break;
        } else {
            if (sk->sk_err) {
                copied = sock_error(sk);
                break;
            }

            if (sk->sk_shutdown & RCV_SHUTDOWN)
                break;

            if (sk->sk_state == TCP_CLOSE) {
                copied = -ENOTCONN;
                break;
            }

            if (!timeo) {
                copied = -EAGAIN;
                break;
            }

            if (signal_pending(current)) {
                copied = sock_intr_errno(timeo);
                break;
            }
        }

        pr_debug("block timeout %ld", timeo);
        wait_data = true;
        mptcp_wait_data(sk, &timeo);
        if (unlikely(__mptcp_tcp_fallback(msk)))
            goto fallback;
    }

    if (more_data_avail) {
        if (!test_bit(MPTCP_DATA_READY, &msk->flags))
            set_bit(MPTCP_DATA_READY, &msk->flags);
    } else if (!wait_data) {
        clear_bit(MPTCP_DATA_READY, &msk->flags);

        /* .. race-breaker: ssk might get new data after last
         * data_available() returns false.
         */
        ssk = mptcp_subflow_recv_lookup(msk);
        if (unlikely(ssk))
            set_bit(MPTCP_DATA_READY, &msk->flags);
    }

    release_sock(sk);
    return copied;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
                              struct mptcp_subflow_context *subflow,
                              long timeout)
{
    struct socket *sock = READ_ONCE(ssk->sk_socket);

    list_del(&subflow->node);

    if (sock && sock != sk->sk_socket) {
        /* outgoing subflow */
        sock_release(sock);
    } else {
        /* incoming subflow */
        tcp_close(ssk, timeout);
    }
}

static int __mptcp_init_sock(struct sock *sk)
{
    struct mptcp_sock *msk = mptcp_sk(sk);

    INIT_LIST_HEAD(&msk->conn_list);
    __set_bit(MPTCP_SEND_SPACE, &msk->flags);

    msk->first = NULL;

    return 0;
}

static int mptcp_init_sock(struct sock *sk)
{
    if (!mptcp_is_enabled(sock_net(sk)))
        return -ENOPROTOOPT;

    return __mptcp_init_sock(sk);
}

static void mptcp_subflow_shutdown(struct sock *ssk, int how)
{
    lock_sock(ssk);

    switch (ssk->sk_state) {
    case TCP_LISTEN:
        if (!(how & RCV_SHUTDOWN))
            break;
        /* fall through */
    case TCP_SYN_SENT:
        tcp_disconnect(ssk, O_NONBLOCK);
        break;
    default:
        ssk->sk_shutdown |= how;
        tcp_shutdown(ssk, how);
        break;
    }

    /* Wake up anyone sleeping in poll. */
    ssk->sk_state_change(ssk);
    release_sock(ssk);
}

/* Called with msk lock held, releases such lock before returning */
static void __mptcp_close(struct sock *sk, long timeout)
{
    struct mptcp_subflow_context *subflow, *tmp;
    struct mptcp_sock *msk = mptcp_sk(sk);

    mptcp_token_destroy(msk->token);
    inet_sk_state_store(sk, TCP_CLOSE);

    list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

        __mptcp_close_ssk(sk, ssk, subflow, timeout);
    }

    if (msk->cached_ext)
        __skb_ext_put(msk->cached_ext);
    release_sock(sk);
    sk_common_release(sk);
}

static void mptcp_close(struct sock *sk, long timeout)
{
    lock_sock(sk);
    __mptcp_close(sk, timeout);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
    const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
    struct ipv6_pinfo *msk6 = inet6_sk(msk);

    msk->sk_v6_daddr = ssk->sk_v6_daddr;
    msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

    if (msk6 && ssk6) {
        msk6->saddr = ssk6->saddr;
        msk6->flow_label = ssk6->flow_label;
    }
#endif

    inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
    inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
    inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
    inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
    inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
    inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
                                 bool kern)
{
    struct mptcp_sock *msk = mptcp_sk(sk);
    struct socket *listener;
    struct sock *newsk;

    listener = __mptcp_nmpc_socket(msk);
    if (WARN_ON_ONCE(!listener)) {
        *err = -EINVAL;
        return NULL;
    }

    pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
    newsk = inet_csk_accept(listener->sk, flags, err, kern);
    if
(!newsk)
708 + return NULL;
709 +
710 + pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
711 +
712 + if (sk_is_mptcp(newsk)) {
713 + struct mptcp_subflow_context *subflow;
714 + struct sock *new_mptcp_sock;
715 + struct sock *ssk = newsk;
716 + u64 ack_seq;
717 +
718 + subflow = mptcp_subflow_ctx(newsk);
719 + lock_sock(sk);
720 +
721 + local_bh_disable();
722 + new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC);
723 + if (!new_mptcp_sock) {
724 + *err = -ENOBUFS;
725 + local_bh_enable();
726 + release_sock(sk);
727 + mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
728 + tcp_close(newsk, 0);
729 + return NULL;
730 + }
731 +
732 + __mptcp_init_sock(new_mptcp_sock);
733 +
734 + msk = mptcp_sk(new_mptcp_sock);
735 + msk->local_key = subflow->local_key;
736 + msk->token = subflow->token;
737 + msk->subflow = NULL;
738 + msk->first = newsk;
739 +
740 + mptcp_token_update_accept(newsk, new_mptcp_sock);
741 +
742 + msk->write_seq = subflow->idsn + 1;
743 + if (subflow->can_ack) {
744 + msk->can_ack = true;
745 + msk->remote_key = subflow->remote_key;
746 + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
747 + ack_seq++;
748 + msk->ack_seq = ack_seq;
749 + }
750 + newsk = new_mptcp_sock;
751 + mptcp_copy_inaddrs(newsk, ssk);
752 + list_add(&subflow->node, &msk->conn_list);
753 +
754 + /* will be fully established at mptcp_stream_accept()
755 + * completion.
756 + */
757 + inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
758 + bh_unlock_sock(new_mptcp_sock);
759 + local_bh_enable();
760 + release_sock(sk);
761 +
762 + /* the subflow can already receive packets, avoid racing with
763 + * the receive path and process the pending ones
764 + */
765 + lock_sock(ssk);
766 + subflow->rel_write_seq = 1;
767 + subflow->tcp_sock = ssk;
768 + subflow->conn = new_mptcp_sock;
769 + if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
770 + mptcp_subflow_data_available(ssk);
771 + release_sock(ssk);
772 + }
773 +
774 + return newsk;
775 + }
776 +
777 + static void mptcp_destroy(struct sock *sk)
778 + {
779 + }
780 +
781 + static int mptcp_setsockopt(struct sock *sk, int level, int optname,
782 + char __user *uoptval, unsigned int optlen)
783 + {
784 + struct mptcp_sock *msk = mptcp_sk(sk);
785 + char __kernel *optval;
786 + int ret = -EOPNOTSUPP;
787 + struct socket *ssock;
788 +
789 + /* will be treated as __user in tcp_setsockopt */
790 + optval = (char __kernel __force *)uoptval;
791 +
792 + pr_debug("msk=%p", msk);
793 +
794 + /* @@ the meaning of setsockopt() when the socket is connected and
795 + * there are multiple subflows is not defined.
796 + */
797 + lock_sock(sk);
798 + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
799 + if (!IS_ERR(ssock)) {
800 + pr_debug("subflow=%p", ssock->sk);
801 + ret = kernel_setsockopt(ssock, level, optname, optval, optlen);
802 + }
803 + release_sock(sk);
804 +
805 + return ret;
806 + }
807 +
808 + static int mptcp_getsockopt(struct sock *sk, int level, int optname,
809 + char __user *uoptval, int __user *uoption)
810 + {
811 + struct mptcp_sock *msk = mptcp_sk(sk);
812 + char __kernel *optval;
813 + int ret = -EOPNOTSUPP;
814 + int __kernel *option;
815 + struct socket *ssock;
816 +
817 + /* will be treated as __user in tcp_getsockopt */
818 + optval = (char __kernel __force *)uoptval;
819 + option = (int __kernel __force *)uoption;
820 +
821 + pr_debug("msk=%p", msk);
822 +
823 + /* @@ the meaning of getsockopt() when the socket is connected and
824 + * there are multiple subflows is not defined.
825 + */ 826 + lock_sock(sk); 827 + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); 828 + if (!IS_ERR(ssock)) { 829 + pr_debug("subflow=%p", ssock->sk); 830 + ret = kernel_getsockopt(ssock, level, optname, optval, option); 831 + } 832 + release_sock(sk); 833 + 834 + return ret; 835 + } 836 + 837 + static int mptcp_get_port(struct sock *sk, unsigned short snum) 838 + { 839 + struct mptcp_sock *msk = mptcp_sk(sk); 840 + struct socket *ssock; 841 + 842 + ssock = __mptcp_nmpc_socket(msk); 843 + pr_debug("msk=%p, subflow=%p", msk, ssock); 844 + if (WARN_ON_ONCE(!ssock)) 845 + return -EINVAL; 846 + 847 + return inet_csk_get_port(ssock->sk, snum); 848 + } 849 + 850 + void mptcp_finish_connect(struct sock *ssk) 851 + { 852 + struct mptcp_subflow_context *subflow; 853 + struct mptcp_sock *msk; 854 + struct sock *sk; 855 + u64 ack_seq; 856 + 857 + subflow = mptcp_subflow_ctx(ssk); 858 + 859 + if (!subflow->mp_capable) 860 + return; 861 + 862 + sk = subflow->conn; 863 + msk = mptcp_sk(sk); 864 + 865 + pr_debug("msk=%p, token=%u", sk, subflow->token); 866 + 867 + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 868 + ack_seq++; 869 + subflow->map_seq = ack_seq; 870 + subflow->map_subflow_seq = 1; 871 + subflow->rel_write_seq = 1; 872 + 873 + /* the socket is not connected yet, no msk/subflow ops can access/race 874 + * accessing the field below 875 + */ 876 + WRITE_ONCE(msk->remote_key, subflow->remote_key); 877 + WRITE_ONCE(msk->local_key, subflow->local_key); 878 + WRITE_ONCE(msk->token, subflow->token); 879 + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 880 + WRITE_ONCE(msk->ack_seq, ack_seq); 881 + WRITE_ONCE(msk->can_ack, 1); 882 + } 883 + 884 + static void mptcp_sock_graft(struct sock *sk, struct socket *parent) 885 + { 886 + write_lock_bh(&sk->sk_callback_lock); 887 + rcu_assign_pointer(sk->sk_wq, &parent->wq); 888 + sk_set_socket(sk, parent); 889 + sk->sk_uid = SOCK_INODE(parent)->i_uid; 890 + write_unlock_bh(&sk->sk_callback_lock); 891 + } 892 + 893 + static bool mptcp_memory_free(const struct sock *sk, int wake) 894 + { 895 + struct mptcp_sock *msk = mptcp_sk(sk); 896 + 897 + return wake ? 
test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; 898 + } 899 + 900 + static struct proto mptcp_prot = { 901 + .name = "MPTCP", 902 + .owner = THIS_MODULE, 903 + .init = mptcp_init_sock, 904 + .close = mptcp_close, 905 + .accept = mptcp_accept, 906 + .setsockopt = mptcp_setsockopt, 907 + .getsockopt = mptcp_getsockopt, 908 + .shutdown = tcp_shutdown, 909 + .destroy = mptcp_destroy, 910 + .sendmsg = mptcp_sendmsg, 911 + .recvmsg = mptcp_recvmsg, 912 + .hash = inet_hash, 913 + .unhash = inet_unhash, 914 + .get_port = mptcp_get_port, 915 + .stream_memory_free = mptcp_memory_free, 916 + .obj_size = sizeof(struct mptcp_sock), 917 + .no_autobind = true, 918 + }; 919 + 920 + static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 921 + { 922 + struct mptcp_sock *msk = mptcp_sk(sock->sk); 923 + struct socket *ssock; 924 + int err; 925 + 926 + lock_sock(sock->sk); 927 + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); 928 + if (IS_ERR(ssock)) { 929 + err = PTR_ERR(ssock); 930 + goto unlock; 931 + } 932 + 933 + err = ssock->ops->bind(ssock, uaddr, addr_len); 934 + if (!err) 935 + mptcp_copy_inaddrs(sock->sk, ssock->sk); 936 + 937 + unlock: 938 + release_sock(sock->sk); 939 + return err; 940 + } 941 + 942 + static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, 943 + int addr_len, int flags) 944 + { 945 + struct mptcp_sock *msk = mptcp_sk(sock->sk); 946 + struct socket *ssock; 947 + int err; 948 + 949 + lock_sock(sock->sk); 950 + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); 951 + if (IS_ERR(ssock)) { 952 + err = PTR_ERR(ssock); 953 + goto unlock; 954 + } 955 + 956 + #ifdef CONFIG_TCP_MD5SIG 957 + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 958 + * TCP option space. 959 + */ 960 + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) 961 + mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; 962 + #endif 963 + 964 + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); 965 + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 966 + mptcp_copy_inaddrs(sock->sk, ssock->sk); 967 + 968 + unlock: 969 + release_sock(sock->sk); 970 + return err; 971 + } 972 + 973 + static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, 974 + int peer) 975 + { 976 + if (sock->sk->sk_prot == &tcp_prot) { 977 + /* we are being invoked from __sys_accept4, after 978 + * mptcp_accept() has just accepted a non-mp-capable 979 + * flow: sk is a tcp_sk, not an mptcp one. 980 + * 981 + * Hand the socket over to tcp so all further socket ops 982 + * bypass mptcp. 983 + */ 984 + sock->ops = &inet_stream_ops; 985 + } 986 + 987 + return inet_getname(sock, uaddr, peer); 988 + } 989 + 990 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 991 + static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, 992 + int peer) 993 + { 994 + if (sock->sk->sk_prot == &tcpv6_prot) { 995 + /* we are being invoked from __sys_accept4 after 996 + * mptcp_accept() has accepted a non-mp-capable 997 + * subflow: sk is a tcp_sk, not mptcp. 998 + * 999 + * Hand the socket over to tcp so all further 1000 + * socket ops bypass mptcp. 
1001 + */ 1002 + sock->ops = &inet6_stream_ops; 1003 + } 1004 + 1005 + return inet6_getname(sock, uaddr, peer); 1006 + } 1007 + #endif 1008 + 1009 + static int mptcp_listen(struct socket *sock, int backlog) 1010 + { 1011 + struct mptcp_sock *msk = mptcp_sk(sock->sk); 1012 + struct socket *ssock; 1013 + int err; 1014 + 1015 + pr_debug("msk=%p", msk); 1016 + 1017 + lock_sock(sock->sk); 1018 + ssock = __mptcp_socket_create(msk, TCP_LISTEN); 1019 + if (IS_ERR(ssock)) { 1020 + err = PTR_ERR(ssock); 1021 + goto unlock; 1022 + } 1023 + 1024 + err = ssock->ops->listen(ssock, backlog); 1025 + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 1026 + if (!err) 1027 + mptcp_copy_inaddrs(sock->sk, ssock->sk); 1028 + 1029 + unlock: 1030 + release_sock(sock->sk); 1031 + return err; 1032 + } 1033 + 1034 + static bool is_tcp_proto(const struct proto *p) 1035 + { 1036 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1037 + return p == &tcp_prot || p == &tcpv6_prot; 1038 + #else 1039 + return p == &tcp_prot; 1040 + #endif 1041 + } 1042 + 1043 + static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, 1044 + int flags, bool kern) 1045 + { 1046 + struct mptcp_sock *msk = mptcp_sk(sock->sk); 1047 + struct socket *ssock; 1048 + int err; 1049 + 1050 + pr_debug("msk=%p", msk); 1051 + 1052 + lock_sock(sock->sk); 1053 + if (sock->sk->sk_state != TCP_LISTEN) 1054 + goto unlock_fail; 1055 + 1056 + ssock = __mptcp_nmpc_socket(msk); 1057 + if (!ssock) 1058 + goto unlock_fail; 1059 + 1060 + sock_hold(ssock->sk); 1061 + release_sock(sock->sk); 1062 + 1063 + err = ssock->ops->accept(sock, newsock, flags, kern); 1064 + if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { 1065 + struct mptcp_sock *msk = mptcp_sk(newsock->sk); 1066 + struct mptcp_subflow_context *subflow; 1067 + 1068 + /* set ssk->sk_socket of accept()ed flows to mptcp socket. 1069 + * This is needed so NOSPACE flag can be set from tcp stack. 
1070 + */
1071 + list_for_each_entry(subflow, &msk->conn_list, node) {
1072 + struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1073 +
1074 + if (!ssk->sk_socket)
1075 + mptcp_sock_graft(ssk, newsock);
1076 + }
1077 +
1078 + inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
1079 + }
1080 +
1081 + sock_put(ssock->sk);
1082 + return err;
1083 +
1084 + unlock_fail:
1085 + release_sock(sock->sk);
1086 + return -EINVAL;
1087 + }
1088 +
1089 + static __poll_t mptcp_poll(struct file *file, struct socket *sock,
1090 + struct poll_table_struct *wait)
1091 + {
1092 + struct sock *sk = sock->sk;
1093 + struct mptcp_sock *msk;
1094 + struct socket *ssock;
1095 + __poll_t mask = 0;
1096 +
1097 + msk = mptcp_sk(sk);
1098 + lock_sock(sk);
1099 + ssock = __mptcp_nmpc_socket(msk);
1100 + if (ssock) {
1101 + mask = ssock->ops->poll(file, ssock, wait);
1102 + release_sock(sk);
1103 + return mask;
1104 + }
1105 +
1106 + release_sock(sk);
1107 + sock_poll_wait(file, sock, wait);
1108 + lock_sock(sk);
1109 + ssock = __mptcp_tcp_fallback(msk);
1110 + if (unlikely(ssock)) {
1111 + mask = ssock->ops->poll(file, ssock, NULL);
+ release_sock(sk); /* do not return with the msk lock held */
+ return mask;
+ }
1112 +
1113 + if (test_bit(MPTCP_DATA_READY, &msk->flags))
1114 + mask = EPOLLIN | EPOLLRDNORM;
1115 + if (sk_stream_is_writeable(sk) &&
1116 + test_bit(MPTCP_SEND_SPACE, &msk->flags))
1117 + mask |= EPOLLOUT | EPOLLWRNORM;
1118 + if (sk->sk_shutdown & RCV_SHUTDOWN)
1119 + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1120 +
1121 + release_sock(sk);
1122 +
1123 + return mask;
1124 + }
1125 +
1126 + static int mptcp_shutdown(struct socket *sock, int how)
1127 + {
1128 + struct mptcp_sock *msk = mptcp_sk(sock->sk);
1129 + struct mptcp_subflow_context *subflow;
1130 + int ret = 0;
1131 +
1132 + pr_debug("sk=%p, how=%d", msk, how);
1133 +
1134 + lock_sock(sock->sk);
1135 +
1136 + if (how == SHUT_WR || how == SHUT_RDWR)
1137 + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
1138 +
1139 + how++;
1140 +
1141 + if ((how & ~SHUTDOWN_MASK) || !how) {
1142 + ret = -EINVAL;
1143 + goto out_unlock;
1144 + }
1145 +
1146 + if (sock->state == SS_CONNECTING) {
1147 + if ((1 << sock->sk->sk_state) &
1148 + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
1149 + sock->state = SS_DISCONNECTING;
1150 + else
1151 + sock->state = SS_CONNECTED;
1152 + }
1153 +
1154 + mptcp_for_each_subflow(msk, subflow) {
1155 + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
1156 +
1157 + mptcp_subflow_shutdown(tcp_sk, how);
1158 + }
1159 +
1160 + out_unlock:
1161 + release_sock(sock->sk);
1162 +
1163 + return ret;
1164 + }
1165 +
1166 + static struct proto_ops mptcp_stream_ops;
1167 +
1168 + static struct inet_protosw mptcp_protosw = {
1169 + .type = SOCK_STREAM,
1170 + .protocol = IPPROTO_MPTCP,
1171 + .prot = &mptcp_prot,
1172 + .ops = &mptcp_stream_ops,
1173 + .flags = INET_PROTOSW_ICSK,
1174 + };
1175 +
1176 + void mptcp_proto_init(void)
1177 + {
1178 + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
1179 + mptcp_stream_ops = inet_stream_ops;
1180 + mptcp_stream_ops.bind = mptcp_bind;
1181 + mptcp_stream_ops.connect = mptcp_stream_connect;
1182 + mptcp_stream_ops.poll = mptcp_poll;
1183 + mptcp_stream_ops.accept = mptcp_stream_accept;
1184 + mptcp_stream_ops.getname = mptcp_v4_getname;
1185 + mptcp_stream_ops.listen = mptcp_listen;
1186 + mptcp_stream_ops.shutdown = mptcp_shutdown;
1187 +
1188 + mptcp_subflow_init();
1189 +
1190 + if (proto_register(&mptcp_prot, 1) != 0)
1191 + panic("Failed to register MPTCP proto.\n");
1192 +
1193 + inet_register_protosw(&mptcp_protosw);
1194 + }
1195 +
1196 + #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1197 + static struct proto_ops mptcp_v6_stream_ops; 1198 + static struct proto mptcp_v6_prot; 1199 + 1200 + static void mptcp_v6_destroy(struct sock *sk) 1201 + { 1202 + mptcp_destroy(sk); 1203 + inet6_destroy_sock(sk); 1204 + } 1205 + 1206 + static struct inet_protosw mptcp_v6_protosw = { 1207 + .type = SOCK_STREAM, 1208 + .protocol = IPPROTO_MPTCP, 1209 + .prot = &mptcp_v6_prot, 1210 + .ops = &mptcp_v6_stream_ops, 1211 + .flags = INET_PROTOSW_ICSK, 1212 + }; 1213 + 1214 + int mptcp_proto_v6_init(void) 1215 + { 1216 + int err; 1217 + 1218 + mptcp_v6_prot = mptcp_prot; 1219 + strcpy(mptcp_v6_prot.name, "MPTCPv6"); 1220 + mptcp_v6_prot.slab = NULL; 1221 + mptcp_v6_prot.destroy = mptcp_v6_destroy; 1222 + mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + 1223 + sizeof(struct ipv6_pinfo); 1224 + 1225 + err = proto_register(&mptcp_v6_prot, 1); 1226 + if (err) 1227 + return err; 1228 + 1229 + mptcp_v6_stream_ops = inet6_stream_ops; 1230 + mptcp_v6_stream_ops.bind = mptcp_bind; 1231 + mptcp_v6_stream_ops.connect = mptcp_stream_connect; 1232 + mptcp_v6_stream_ops.poll = mptcp_poll; 1233 + mptcp_v6_stream_ops.accept = mptcp_stream_accept; 1234 + mptcp_v6_stream_ops.getname = mptcp_v6_getname; 1235 + mptcp_v6_stream_ops.listen = mptcp_listen; 1236 + mptcp_v6_stream_ops.shutdown = mptcp_shutdown; 1237 + 1238 + err = inet6_register_protosw(&mptcp_v6_protosw); 1239 + if (err) 1240 + proto_unregister(&mptcp_v6_prot); 1241 + 1242 + return err; 1243 + } 1244 + #endif
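The proto_ops wired up in mptcp_proto_init() mean an application opts in simply by asking for IPPROTO_MPTCP on an otherwise ordinary stream socket. A minimal userspace sketch of that surface (not part of the series; the IPPROTO_MPTCP fallback define mirrors the selftest further down, and socket() fails with ENOPROTOOPT unless the per-namespace sysctl enables MPTCP, see mptcp_init_sock() above):

	#include <stdio.h>
	#include <unistd.h>
	#include <arpa/inet.h>
	#include <sys/socket.h>

	#ifndef IPPROTO_MPTCP
	#define IPPROTO_MPTCP 262	/* not yet in installed uapi headers */
	#endif

	int main(void)
	{
		struct sockaddr_in addr = {
			.sin_family = AF_INET,
			.sin_port = htons(12000),	/* same default port as the selftest */
		};
		char buf[64];
		ssize_t n;
		int fd;

		inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

		fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
		if (fd < 0) {
			perror("socket");	/* e.g. MPTCP disabled in this netns */
			return 1;
		}

		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
			perror("connect");
			return 1;
		}

		n = write(fd, "hello", 5);
		if (n > 0 && (n = read(fd, buf, sizeof(buf))) > 0)
			printf("echoed %zd bytes\n", n);

		close(fd);
		return 0;
	}

connect() here lands in mptcp_stream_connect(), which creates the first subflow and drives the MP_CAPABLE handshake; a peer or middlebox that strips the option leaves the connection running as plain TCP through the fallback paths above, so the application never has to care.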
+240
net/mptcp/protocol.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Multipath TCP 3 + * 4 + * Copyright (c) 2017 - 2019, Intel Corporation. 5 + */ 6 + 7 + #ifndef __MPTCP_PROTOCOL_H 8 + #define __MPTCP_PROTOCOL_H 9 + 10 + #include <linux/random.h> 11 + #include <net/tcp.h> 12 + #include <net/inet_connection_sock.h> 13 + 14 + #define MPTCP_SUPPORTED_VERSION 1 15 + 16 + /* MPTCP option bits */ 17 + #define OPTION_MPTCP_MPC_SYN BIT(0) 18 + #define OPTION_MPTCP_MPC_SYNACK BIT(1) 19 + #define OPTION_MPTCP_MPC_ACK BIT(2) 20 + 21 + /* MPTCP option subtypes */ 22 + #define MPTCPOPT_MP_CAPABLE 0 23 + #define MPTCPOPT_MP_JOIN 1 24 + #define MPTCPOPT_DSS 2 25 + #define MPTCPOPT_ADD_ADDR 3 26 + #define MPTCPOPT_RM_ADDR 4 27 + #define MPTCPOPT_MP_PRIO 5 28 + #define MPTCPOPT_MP_FAIL 6 29 + #define MPTCPOPT_MP_FASTCLOSE 7 30 + 31 + /* MPTCP suboption lengths */ 32 + #define TCPOLEN_MPTCP_MPC_SYN 4 33 + #define TCPOLEN_MPTCP_MPC_SYNACK 12 34 + #define TCPOLEN_MPTCP_MPC_ACK 20 35 + #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 36 + #define TCPOLEN_MPTCP_DSS_BASE 4 37 + #define TCPOLEN_MPTCP_DSS_ACK32 4 38 + #define TCPOLEN_MPTCP_DSS_ACK64 8 39 + #define TCPOLEN_MPTCP_DSS_MAP32 10 40 + #define TCPOLEN_MPTCP_DSS_MAP64 14 41 + #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 42 + 43 + /* MPTCP MP_CAPABLE flags */ 44 + #define MPTCP_VERSION_MASK (0x0F) 45 + #define MPTCP_CAP_CHECKSUM_REQD BIT(7) 46 + #define MPTCP_CAP_EXTENSIBILITY BIT(6) 47 + #define MPTCP_CAP_HMAC_SHA256 BIT(0) 48 + #define MPTCP_CAP_FLAG_MASK (0x3F) 49 + 50 + /* MPTCP DSS flags */ 51 + #define MPTCP_DSS_DATA_FIN BIT(4) 52 + #define MPTCP_DSS_DSN64 BIT(3) 53 + #define MPTCP_DSS_HAS_MAP BIT(2) 54 + #define MPTCP_DSS_ACK64 BIT(1) 55 + #define MPTCP_DSS_HAS_ACK BIT(0) 56 + #define MPTCP_DSS_FLAG_MASK (0x1F) 57 + 58 + /* MPTCP socket flags */ 59 + #define MPTCP_DATA_READY BIT(0) 60 + #define MPTCP_SEND_SPACE BIT(1) 61 + 62 + /* MPTCP connection sock */ 63 + struct mptcp_sock { 64 + /* inet_connection_sock must be the first member */ 65 + struct inet_connection_sock sk; 66 + u64 local_key; 67 + u64 remote_key; 68 + u64 write_seq; 69 + u64 ack_seq; 70 + u32 token; 71 + unsigned long flags; 72 + bool can_ack; 73 + struct list_head conn_list; 74 + struct skb_ext *cached_ext; /* for the next sendmsg */ 75 + struct socket *subflow; /* outgoing connect/listener/!mp_capable */ 76 + struct sock *first; 77 + }; 78 + 79 + #define mptcp_for_each_subflow(__msk, __subflow) \ 80 + list_for_each_entry(__subflow, &((__msk)->conn_list), node) 81 + 82 + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) 83 + { 84 + return (struct mptcp_sock *)sk; 85 + } 86 + 87 + struct mptcp_subflow_request_sock { 88 + struct tcp_request_sock sk; 89 + u16 mp_capable : 1, 90 + mp_join : 1, 91 + backup : 1, 92 + remote_key_valid : 1; 93 + u64 local_key; 94 + u64 remote_key; 95 + u64 idsn; 96 + u32 token; 97 + u32 ssn_offset; 98 + }; 99 + 100 + static inline struct mptcp_subflow_request_sock * 101 + mptcp_subflow_rsk(const struct request_sock *rsk) 102 + { 103 + return (struct mptcp_subflow_request_sock *)rsk; 104 + } 105 + 106 + /* MPTCP subflow context */ 107 + struct mptcp_subflow_context { 108 + struct list_head node;/* conn_list of subflows */ 109 + u64 local_key; 110 + u64 remote_key; 111 + u64 idsn; 112 + u64 map_seq; 113 + u32 snd_isn; 114 + u32 token; 115 + u32 rel_write_seq; 116 + u32 map_subflow_seq; 117 + u32 ssn_offset; 118 + u32 map_data_len; 119 + u32 request_mptcp : 1, /* send MP_CAPABLE */ 120 + mp_capable : 1, /* remote is MPTCP capable */ 121 + fourth_ack : 1, /* send initial DSS */ 
122 + conn_finished : 1,
123 + map_valid : 1,
124 + mpc_map : 1,
125 + data_avail : 1,
126 + rx_eof : 1,
127 + can_ack : 1; /* only after processing the remote key */
128 +
129 + struct sock *tcp_sock; /* tcp sk backpointer */
130 + struct sock *conn; /* parent mptcp_sock */
131 + const struct inet_connection_sock_af_ops *icsk_af_ops;
132 + void (*tcp_data_ready)(struct sock *sk);
133 + void (*tcp_state_change)(struct sock *sk);
134 + void (*tcp_write_space)(struct sock *sk);
135 +
136 + struct rcu_head rcu;
137 + };
138 +
139 + static inline struct mptcp_subflow_context *
140 + mptcp_subflow_ctx(const struct sock *sk)
141 + {
142 + struct inet_connection_sock *icsk = inet_csk(sk);
143 +
144 + /* Use RCU on icsk_ulp_data only for sock diag code */
145 + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
146 + }
147 +
148 + static inline struct sock *
149 + mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
150 + {
151 + return subflow->tcp_sock;
152 + }
153 +
154 + static inline u64
155 + mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
156 + {
157 + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq -
158 + subflow->ssn_offset -
159 + subflow->map_subflow_seq;
160 + }
161 +
162 + static inline u64
163 + mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
164 + {
165 + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
166 + }
167 +
168 + int mptcp_is_enabled(struct net *net);
169 + bool mptcp_subflow_data_available(struct sock *sk);
170 + void mptcp_subflow_init(void);
171 + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
172 +
173 + static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
174 + struct mptcp_subflow_context *ctx)
175 + {
176 + sk->sk_data_ready = ctx->tcp_data_ready;
177 + sk->sk_state_change = ctx->tcp_state_change;
178 + sk->sk_write_space = ctx->tcp_write_space;
179 +
180 + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
181 + }
182 +
183 + extern const struct inet_connection_sock_af_ops ipv4_specific;
184 + #if IS_ENABLED(CONFIG_MPTCP_IPV6)
185 + extern const struct inet_connection_sock_af_ops ipv6_specific;
186 + #endif
187 +
188 + void mptcp_proto_init(void);
189 + #if IS_ENABLED(CONFIG_MPTCP_IPV6)
190 + int mptcp_proto_v6_init(void);
191 + #endif
192 +
193 + struct mptcp_read_arg {
194 + struct msghdr *msg;
195 + };
196 +
197 + int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
198 + unsigned int offset, size_t len);
199 +
200 + void mptcp_get_options(const struct sk_buff *skb,
201 + struct tcp_options_received *opt_rx);
202 +
203 + void mptcp_finish_connect(struct sock *sk);
204 +
205 + int mptcp_token_new_request(struct request_sock *req);
206 + void mptcp_token_destroy_request(u32 token);
207 + int mptcp_token_new_connect(struct sock *sk);
208 + int mptcp_token_new_accept(u32 token);
209 + void mptcp_token_update_accept(struct sock *sk, struct sock *conn);
210 + void mptcp_token_destroy(u32 token);
211 +
212 + void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
213 + static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
214 + {
215 + /* we might consider a faster version that computes the key as a
216 + * hash of some information available in the MPTCP socket. Use
217 + * random data at the moment, as it's probably the safest option
218 + * in case multiple sockets are opened in different namespaces at
219 + * the same time.
220 + */ 221 + get_random_bytes(key, sizeof(u64)); 222 + mptcp_crypto_key_sha(*key, token, idsn); 223 + } 224 + 225 + void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, 226 + void *hash_out); 227 + 228 + static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) 229 + { 230 + return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); 231 + } 232 + 233 + static inline bool before64(__u64 seq1, __u64 seq2) 234 + { 235 + return (__s64)(seq1 - seq2) < 0; 236 + } 237 + 238 + #define after64(seq2, seq1) before64(seq1, seq2) 239 + 240 + #endif /* __MPTCP_PROTOCOL_H */
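One detail of the header worth a second look: before64()/after64() are the 64-bit analogue of tcp.h's before()/after(). Computing the difference in unsigned arithmetic and testing the sign of the result keeps the comparison correct even when the data sequence space wraps. A standalone sketch, with uint64_t standing in for u64:

	#include <assert.h>
	#include <stdint.h>

	/* same logic as before64() in protocol.h, userspace types */
	static int before64(uint64_t seq1, uint64_t seq2)
	{
		return (int64_t)(seq1 - seq2) < 0;
	}

	int main(void)
	{
		assert(before64(1, 2));		/* ordinary case */
		assert(before64(~0ULL, 1));	/* 2^64 - 1 precedes 1 across the wrap */
		assert(!before64(1, ~0ULL));
		return 0;
	}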
+860
net/mptcp/subflow.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Multipath TCP 3 + * 4 + * Copyright (c) 2017 - 2019, Intel Corporation. 5 + */ 6 + 7 + #define pr_fmt(fmt) "MPTCP: " fmt 8 + 9 + #include <linux/kernel.h> 10 + #include <linux/module.h> 11 + #include <linux/netdevice.h> 12 + #include <net/sock.h> 13 + #include <net/inet_common.h> 14 + #include <net/inet_hashtables.h> 15 + #include <net/protocol.h> 16 + #include <net/tcp.h> 17 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 18 + #include <net/ip6_route.h> 19 + #endif 20 + #include <net/mptcp.h> 21 + #include "protocol.h" 22 + 23 + static int subflow_rebuild_header(struct sock *sk) 24 + { 25 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 26 + int err = 0; 27 + 28 + if (subflow->request_mptcp && !subflow->token) { 29 + pr_debug("subflow=%p", sk); 30 + err = mptcp_token_new_connect(sk); 31 + } 32 + 33 + if (err) 34 + return err; 35 + 36 + return subflow->icsk_af_ops->rebuild_header(sk); 37 + } 38 + 39 + static void subflow_req_destructor(struct request_sock *req) 40 + { 41 + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 42 + 43 + pr_debug("subflow_req=%p", subflow_req); 44 + 45 + if (subflow_req->mp_capable) 46 + mptcp_token_destroy_request(subflow_req->token); 47 + tcp_request_sock_ops.destructor(req); 48 + } 49 + 50 + static void subflow_init_req(struct request_sock *req, 51 + const struct sock *sk_listener, 52 + struct sk_buff *skb) 53 + { 54 + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); 55 + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 56 + struct tcp_options_received rx_opt; 57 + 58 + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); 59 + 60 + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); 61 + mptcp_get_options(skb, &rx_opt); 62 + 63 + subflow_req->mp_capable = 0; 64 + subflow_req->remote_key_valid = 0; 65 + 66 + #ifdef CONFIG_TCP_MD5SIG 67 + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 68 + * TCP option space. 
69 + */ 70 + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) 71 + return; 72 + #endif 73 + 74 + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { 75 + int err; 76 + 77 + err = mptcp_token_new_request(req); 78 + if (err == 0) 79 + subflow_req->mp_capable = 1; 80 + 81 + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; 82 + } 83 + } 84 + 85 + static void subflow_v4_init_req(struct request_sock *req, 86 + const struct sock *sk_listener, 87 + struct sk_buff *skb) 88 + { 89 + tcp_rsk(req)->is_mptcp = 1; 90 + 91 + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); 92 + 93 + subflow_init_req(req, sk_listener, skb); 94 + } 95 + 96 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 97 + static void subflow_v6_init_req(struct request_sock *req, 98 + const struct sock *sk_listener, 99 + struct sk_buff *skb) 100 + { 101 + tcp_rsk(req)->is_mptcp = 1; 102 + 103 + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); 104 + 105 + subflow_init_req(req, sk_listener, skb); 106 + } 107 + #endif 108 + 109 + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) 110 + { 111 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 112 + 113 + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); 114 + 115 + if (subflow->conn && !subflow->conn_finished) { 116 + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), 117 + subflow->remote_key); 118 + mptcp_finish_connect(sk); 119 + subflow->conn_finished = 1; 120 + 121 + if (skb) { 122 + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); 123 + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; 124 + } 125 + } 126 + } 127 + 128 + static struct request_sock_ops subflow_request_sock_ops; 129 + static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; 130 + 131 + static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) 132 + { 133 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 134 + 135 + pr_debug("subflow=%p", subflow); 136 + 137 + /* Never answer to SYNs sent to broadcast or multicast */ 138 + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 139 + goto drop; 140 + 141 + return tcp_conn_request(&subflow_request_sock_ops, 142 + &subflow_request_sock_ipv4_ops, 143 + sk, skb); 144 + drop: 145 + tcp_listendrop(sk); 146 + return 0; 147 + } 148 + 149 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 150 + static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; 151 + static struct inet_connection_sock_af_ops subflow_v6_specific; 152 + static struct inet_connection_sock_af_ops subflow_v6m_specific; 153 + 154 + static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) 155 + { 156 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 157 + 158 + pr_debug("subflow=%p", subflow); 159 + 160 + if (skb->protocol == htons(ETH_P_IP)) 161 + return subflow_v4_conn_request(sk, skb); 162 + 163 + if (!ipv6_unicast_destination(skb)) 164 + goto drop; 165 + 166 + return tcp_conn_request(&subflow_request_sock_ops, 167 + &subflow_request_sock_ipv6_ops, sk, skb); 168 + 169 + drop: 170 + tcp_listendrop(sk); 171 + return 0; /* don't send reset */ 172 + } 173 + #endif 174 + 175 + static struct sock *subflow_syn_recv_sock(const struct sock *sk, 176 + struct sk_buff *skb, 177 + struct request_sock *req, 178 + struct dst_entry *dst, 179 + struct request_sock *req_unhash, 180 + bool *own_req) 181 + { 182 + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); 183 + struct mptcp_subflow_request_sock *subflow_req; 184 + struct tcp_options_received opt_rx; 185 + struct sock 
*child; 186 + 187 + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); 188 + 189 + /* if the sk is MP_CAPABLE, we try to fetch the client key */ 190 + subflow_req = mptcp_subflow_rsk(req); 191 + if (subflow_req->mp_capable) { 192 + if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { 193 + /* here we can receive and accept an in-window, 194 + * out-of-order pkt, which will not carry the MP_CAPABLE 195 + * opt even on mptcp enabled paths 196 + */ 197 + goto create_child; 198 + } 199 + 200 + opt_rx.mptcp.mp_capable = 0; 201 + mptcp_get_options(skb, &opt_rx); 202 + if (opt_rx.mptcp.mp_capable) { 203 + subflow_req->remote_key = opt_rx.mptcp.sndr_key; 204 + subflow_req->remote_key_valid = 1; 205 + } else { 206 + subflow_req->mp_capable = 0; 207 + } 208 + } 209 + 210 + create_child: 211 + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, 212 + req_unhash, own_req); 213 + 214 + if (child && *own_req) { 215 + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); 216 + 217 + /* we have null ctx on TCP fallback, not fatal on MPC 218 + * handshake 219 + */ 220 + if (!ctx) 221 + return child; 222 + 223 + if (ctx->mp_capable) { 224 + if (mptcp_token_new_accept(ctx->token)) 225 + goto close_child; 226 + } 227 + } 228 + 229 + return child; 230 + 231 + close_child: 232 + pr_debug("closing child socket"); 233 + tcp_send_active_reset(child, GFP_ATOMIC); 234 + inet_csk_prepare_forced_close(child); 235 + tcp_done(child); 236 + return NULL; 237 + } 238 + 239 + static struct inet_connection_sock_af_ops subflow_specific; 240 + 241 + enum mapping_status { 242 + MAPPING_OK, 243 + MAPPING_INVALID, 244 + MAPPING_EMPTY, 245 + MAPPING_DATA_FIN 246 + }; 247 + 248 + static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) 249 + { 250 + if ((u32)seq == (u32)old_seq) 251 + return old_seq; 252 + 253 + /* Assume map covers data not mapped yet. */ 254 + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); 255 + } 256 + 257 + static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) 258 + { 259 + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", 260 + ssn, subflow->map_subflow_seq, subflow->map_data_len); 261 + } 262 + 263 + static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) 264 + { 265 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 266 + unsigned int skb_consumed; 267 + 268 + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; 269 + if (WARN_ON_ONCE(skb_consumed >= skb->len)) 270 + return true; 271 + 272 + return skb->len - skb_consumed <= subflow->map_data_len - 273 + mptcp_subflow_get_map_offset(subflow); 274 + } 275 + 276 + static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) 277 + { 278 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 279 + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 280 + 281 + if (unlikely(before(ssn, subflow->map_subflow_seq))) { 282 + /* Mapping covers data later in the subflow stream, 283 + * currently unsupported. 
284 + */
285 + warn_bad_map(subflow, ssn);
286 + return false;
287 + }
288 + if (unlikely(!before(ssn, subflow->map_subflow_seq +
289 + subflow->map_data_len))) {
290 + /* Mapping covers only past subflow data, invalid */
291 + warn_bad_map(subflow, ssn + skb->len);
292 + return false;
293 + }
294 + return true;
295 + }
296 +
297 + static enum mapping_status get_mapping_status(struct sock *ssk)
298 + {
299 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
300 + struct mptcp_ext *mpext;
301 + struct sk_buff *skb;
302 + u16 data_len;
303 + u64 map_seq;
304 +
305 + skb = skb_peek(&ssk->sk_receive_queue);
306 + if (!skb)
307 + return MAPPING_EMPTY;
308 +
309 + mpext = mptcp_get_ext(skb);
310 + if (!mpext || !mpext->use_map) {
311 + if (!subflow->map_valid && !skb->len) {
312 + /* the TCP stack delivers 0-len FIN pkts to the receive
313 + * queue; those are the only 0-len pkts ever expected here,
314 + * and we can admit a missing mapping only for 0-len pkts
315 + */
316 + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
317 + WARN_ONCE(1, "0len seq %d:%d flags %x",
318 + TCP_SKB_CB(skb)->seq,
319 + TCP_SKB_CB(skb)->end_seq,
320 + TCP_SKB_CB(skb)->tcp_flags);
321 + sk_eat_skb(ssk, skb);
322 + return MAPPING_EMPTY;
323 + }
324 +
325 + if (!subflow->map_valid)
326 + return MAPPING_INVALID;
327 +
328 + goto validate_seq;
329 + }
330 +
331 + pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
332 + mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
333 + mpext->data_len, mpext->data_fin);
334 +
335 + data_len = mpext->data_len;
336 + if (data_len == 0) {
337 + pr_err("Infinite mapping not handled");
338 + return MAPPING_INVALID;
339 + }
340 +
341 + if (mpext->data_fin == 1) {
342 + if (data_len == 1) {
343 + pr_debug("DATA_FIN with no payload");
344 + if (subflow->map_valid) {
345 + /* A DATA_FIN might arrive in a DSS
346 + * option before the previous mapping
347 + * has been fully consumed. Continue
348 + * handling the existing mapping.
349 + */ 350 + skb_ext_del(skb, SKB_EXT_MPTCP); 351 + return MAPPING_OK; 352 + } else { 353 + return MAPPING_DATA_FIN; 354 + } 355 + } 356 + 357 + /* Adjust for DATA_FIN using 1 byte of sequence space */ 358 + data_len--; 359 + } 360 + 361 + if (!mpext->dsn64) { 362 + map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, 363 + mpext->data_seq); 364 + pr_debug("expanded seq=%llu", subflow->map_seq); 365 + } else { 366 + map_seq = mpext->data_seq; 367 + } 368 + 369 + if (subflow->map_valid) { 370 + /* Allow replacing only with an identical map */ 371 + if (subflow->map_seq == map_seq && 372 + subflow->map_subflow_seq == mpext->subflow_seq && 373 + subflow->map_data_len == data_len) { 374 + skb_ext_del(skb, SKB_EXT_MPTCP); 375 + return MAPPING_OK; 376 + } 377 + 378 + /* If this skb data are fully covered by the current mapping, 379 + * the new map would need caching, which is not supported 380 + */ 381 + if (skb_is_fully_mapped(ssk, skb)) 382 + return MAPPING_INVALID; 383 + 384 + /* will validate the next map after consuming the current one */ 385 + return MAPPING_OK; 386 + } 387 + 388 + subflow->map_seq = map_seq; 389 + subflow->map_subflow_seq = mpext->subflow_seq; 390 + subflow->map_data_len = data_len; 391 + subflow->map_valid = 1; 392 + subflow->mpc_map = mpext->mpc_map; 393 + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", 394 + subflow->map_seq, subflow->map_subflow_seq, 395 + subflow->map_data_len); 396 + 397 + validate_seq: 398 + /* we revalidate valid mapping on new skb, because we must ensure 399 + * the current skb is completely covered by the available mapping 400 + */ 401 + if (!validate_mapping(ssk, skb)) 402 + return MAPPING_INVALID; 403 + 404 + skb_ext_del(skb, SKB_EXT_MPTCP); 405 + return MAPPING_OK; 406 + } 407 + 408 + static bool subflow_check_data_avail(struct sock *ssk) 409 + { 410 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 411 + enum mapping_status status; 412 + struct mptcp_sock *msk; 413 + struct sk_buff *skb; 414 + 415 + pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, 416 + subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); 417 + if (subflow->data_avail) 418 + return true; 419 + 420 + if (!subflow->conn) 421 + return false; 422 + 423 + msk = mptcp_sk(subflow->conn); 424 + for (;;) { 425 + u32 map_remaining; 426 + size_t delta; 427 + u64 ack_seq; 428 + u64 old_ack; 429 + 430 + status = get_mapping_status(ssk); 431 + pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); 432 + if (status == MAPPING_INVALID) { 433 + ssk->sk_err = EBADMSG; 434 + goto fatal; 435 + } 436 + 437 + if (status != MAPPING_OK) 438 + return false; 439 + 440 + skb = skb_peek(&ssk->sk_receive_queue); 441 + if (WARN_ON_ONCE(!skb)) 442 + return false; 443 + 444 + /* if msk lacks the remote key, this subflow must provide an 445 + * MP_CAPABLE-based mapping 446 + */ 447 + if (unlikely(!READ_ONCE(msk->can_ack))) { 448 + if (!subflow->mpc_map) { 449 + ssk->sk_err = EBADMSG; 450 + goto fatal; 451 + } 452 + WRITE_ONCE(msk->remote_key, subflow->remote_key); 453 + WRITE_ONCE(msk->ack_seq, subflow->map_seq); 454 + WRITE_ONCE(msk->can_ack, true); 455 + } 456 + 457 + old_ack = READ_ONCE(msk->ack_seq); 458 + ack_seq = mptcp_subflow_get_mapped_dsn(subflow); 459 + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, 460 + ack_seq); 461 + if (ack_seq == old_ack) 462 + break; 463 + 464 + /* only accept in-sequence mapping. 
Old values are spurious
465 + * retransmission; we can hit "future" values on active backup
466 + * subflow switch, we rely on retransmissions to get
467 + * in-sequence data.
468 + * Concurrent subflows support will require subflow data
469 + * reordering
470 + */
471 + map_remaining = subflow->map_data_len -
472 + mptcp_subflow_get_map_offset(subflow);
473 + if (before64(ack_seq, old_ack))
474 + delta = min_t(size_t, old_ack - ack_seq, map_remaining);
475 + else
476 + delta = min_t(size_t, ack_seq - old_ack, map_remaining);
477 +
478 + /* discard mapped data */
479 + pr_debug("discarding %zu bytes, current map len=%d", delta,
480 + map_remaining);
481 + if (delta) {
482 + struct mptcp_read_arg arg = {
483 + .msg = NULL,
484 + };
485 + read_descriptor_t desc = {
486 + .count = delta,
487 + .arg.data = &arg,
488 + };
489 + int ret;
490 +
491 + ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
492 + if (ret < 0) {
493 + ssk->sk_err = -ret;
494 + goto fatal;
495 + }
496 + if (ret < delta)
497 + return false;
498 + if (delta == map_remaining)
499 + subflow->map_valid = 0;
500 + }
501 + }
502 + return true;
503 +
504 + fatal:
505 + /* fatal protocol error, close the socket */
506 + /* This barrier is coupled with smp_rmb() in tcp_poll() */
507 + smp_wmb();
508 + ssk->sk_error_report(ssk);
509 + tcp_set_state(ssk, TCP_CLOSE);
510 + tcp_send_active_reset(ssk, GFP_ATOMIC);
511 + return false;
512 + }
513 +
514 + bool mptcp_subflow_data_available(struct sock *sk)
515 + {
516 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
517 + struct sk_buff *skb;
518 +
519 + /* check if current mapping is still valid */
520 + if (subflow->map_valid &&
521 + mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
522 + subflow->map_valid = 0;
523 + subflow->data_avail = 0;
524 +
525 + pr_debug("Done with mapping: seq=%u data_len=%u",
526 + subflow->map_subflow_seq,
527 + subflow->map_data_len);
528 + }
529 +
530 + if (!subflow_check_data_avail(sk)) {
531 + subflow->data_avail = 0;
532 + return false;
533 + }
534 +
535 + skb = skb_peek(&sk->sk_receive_queue);
536 + subflow->data_avail = skb &&
537 + before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
538 + return subflow->data_avail;
539 + }
540 +
541 + static void subflow_data_ready(struct sock *sk)
542 + {
543 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
544 + struct sock *parent = subflow->conn;
545 +
546 + if (!parent || !subflow->mp_capable) {
547 + subflow->tcp_data_ready(sk);
548 +
549 + if (parent)
550 + parent->sk_data_ready(parent);
551 + return;
552 + }
553 +
554 + if (mptcp_subflow_data_available(sk)) {
555 + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
556 +
557 + parent->sk_data_ready(parent);
558 + }
559 + }
560 +
561 + static void subflow_write_space(struct sock *sk)
562 + {
563 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
564 + struct sock *parent = subflow->conn;
565 +
566 + sk_stream_write_space(sk);
567 + if (parent && sk_stream_is_writeable(sk)) {
568 + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
569 + smp_mb__after_atomic();
570 + /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
571 + sk_stream_write_space(parent);
572 + }
573 + }
574 +
575 + static struct inet_connection_sock_af_ops *
576 + subflow_default_af_ops(struct sock *sk)
577 + {
578 + #if IS_ENABLED(CONFIG_MPTCP_IPV6)
579 + if (sk->sk_family == AF_INET6)
580 + return &subflow_v6_specific;
581 + #endif
582 + return &subflow_specific;
583 + }
584 +
585 + void
mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) 586 + { 587 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 588 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 589 + struct inet_connection_sock *icsk = inet_csk(sk); 590 + struct inet_connection_sock_af_ops *target; 591 + 592 + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); 593 + 594 + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", 595 + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); 596 + 597 + if (likely(icsk->icsk_af_ops == target)) 598 + return; 599 + 600 + subflow->icsk_af_ops = icsk->icsk_af_ops; 601 + icsk->icsk_af_ops = target; 602 + #endif 603 + } 604 + 605 + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) 606 + { 607 + struct mptcp_subflow_context *subflow; 608 + struct net *net = sock_net(sk); 609 + struct socket *sf; 610 + int err; 611 + 612 + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, 613 + &sf); 614 + if (err) 615 + return err; 616 + 617 + lock_sock(sf->sk); 618 + 619 + /* kernel sockets do not by default acquire net ref, but TCP timer 620 + * needs it. 621 + */ 622 + sf->sk->sk_net_refcnt = 1; 623 + get_net(net); 624 + this_cpu_add(*net->core.sock_inuse, 1); 625 + err = tcp_set_ulp(sf->sk, "mptcp"); 626 + release_sock(sf->sk); 627 + 628 + if (err) 629 + return err; 630 + 631 + subflow = mptcp_subflow_ctx(sf->sk); 632 + pr_debug("subflow=%p", subflow); 633 + 634 + *new_sock = sf; 635 + sock_hold(sk); 636 + subflow->conn = sk; 637 + 638 + return 0; 639 + } 640 + 641 + static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, 642 + gfp_t priority) 643 + { 644 + struct inet_connection_sock *icsk = inet_csk(sk); 645 + struct mptcp_subflow_context *ctx; 646 + 647 + ctx = kzalloc(sizeof(*ctx), priority); 648 + if (!ctx) 649 + return NULL; 650 + 651 + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); 652 + INIT_LIST_HEAD(&ctx->node); 653 + 654 + pr_debug("subflow=%p", ctx); 655 + 656 + ctx->tcp_sock = sk; 657 + 658 + return ctx; 659 + } 660 + 661 + static void __subflow_state_change(struct sock *sk) 662 + { 663 + struct socket_wq *wq; 664 + 665 + rcu_read_lock(); 666 + wq = rcu_dereference(sk->sk_wq); 667 + if (skwq_has_sleeper(wq)) 668 + wake_up_interruptible_all(&wq->wait); 669 + rcu_read_unlock(); 670 + } 671 + 672 + static bool subflow_is_done(const struct sock *sk) 673 + { 674 + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; 675 + } 676 + 677 + static void subflow_state_change(struct sock *sk) 678 + { 679 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 680 + struct sock *parent = READ_ONCE(subflow->conn); 681 + 682 + __subflow_state_change(sk); 683 + 684 + /* as recvmsg() does not acquire the subflow socket for ssk selection 685 + * a fin packet carrying a DSS can be unnoticed if we don't trigger 686 + * the data available machinery here. 
687 + */ 688 + if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) { 689 + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); 690 + 691 + parent->sk_data_ready(parent); 692 + } 693 + 694 + if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) && 695 + !subflow->rx_eof && subflow_is_done(sk)) { 696 + subflow->rx_eof = 1; 697 + parent->sk_shutdown |= RCV_SHUTDOWN; 698 + __subflow_state_change(parent); 699 + } 700 + } 701 + 702 + static int subflow_ulp_init(struct sock *sk) 703 + { 704 + struct inet_connection_sock *icsk = inet_csk(sk); 705 + struct mptcp_subflow_context *ctx; 706 + struct tcp_sock *tp = tcp_sk(sk); 707 + int err = 0; 708 + 709 + /* disallow attaching ULP to a socket unless it has been 710 + * created with sock_create_kern() 711 + */ 712 + if (!sk->sk_kern_sock) { 713 + err = -EOPNOTSUPP; 714 + goto out; 715 + } 716 + 717 + ctx = subflow_create_ctx(sk, GFP_KERNEL); 718 + if (!ctx) { 719 + err = -ENOMEM; 720 + goto out; 721 + } 722 + 723 + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); 724 + 725 + tp->is_mptcp = 1; 726 + ctx->icsk_af_ops = icsk->icsk_af_ops; 727 + icsk->icsk_af_ops = subflow_default_af_ops(sk); 728 + ctx->tcp_data_ready = sk->sk_data_ready; 729 + ctx->tcp_state_change = sk->sk_state_change; 730 + ctx->tcp_write_space = sk->sk_write_space; 731 + sk->sk_data_ready = subflow_data_ready; 732 + sk->sk_write_space = subflow_write_space; 733 + sk->sk_state_change = subflow_state_change; 734 + out: 735 + return err; 736 + } 737 + 738 + static void subflow_ulp_release(struct sock *sk) 739 + { 740 + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); 741 + 742 + if (!ctx) 743 + return; 744 + 745 + if (ctx->conn) 746 + sock_put(ctx->conn); 747 + 748 + kfree_rcu(ctx, rcu); 749 + } 750 + 751 + static void subflow_ulp_fallback(struct sock *sk, 752 + struct mptcp_subflow_context *old_ctx) 753 + { 754 + struct inet_connection_sock *icsk = inet_csk(sk); 755 + 756 + mptcp_subflow_tcp_fallback(sk, old_ctx); 757 + icsk->icsk_ulp_ops = NULL; 758 + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); 759 + tcp_sk(sk)->is_mptcp = 0; 760 + } 761 + 762 + static void subflow_ulp_clone(const struct request_sock *req, 763 + struct sock *newsk, 764 + const gfp_t priority) 765 + { 766 + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 767 + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); 768 + struct mptcp_subflow_context *new_ctx; 769 + 770 + if (!subflow_req->mp_capable) { 771 + subflow_ulp_fallback(newsk, old_ctx); 772 + return; 773 + } 774 + 775 + new_ctx = subflow_create_ctx(newsk, priority); 776 + if (new_ctx == NULL) { 777 + subflow_ulp_fallback(newsk, old_ctx); 778 + return; 779 + } 780 + 781 + /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully 782 + * established only after we receive the remote key 783 + */ 784 + new_ctx->conn_finished = 1; 785 + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; 786 + new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; 787 + new_ctx->tcp_state_change = old_ctx->tcp_state_change; 788 + new_ctx->tcp_write_space = old_ctx->tcp_write_space; 789 + new_ctx->mp_capable = 1; 790 + new_ctx->fourth_ack = subflow_req->remote_key_valid; 791 + new_ctx->can_ack = subflow_req->remote_key_valid; 792 + new_ctx->remote_key = subflow_req->remote_key; 793 + new_ctx->local_key = subflow_req->local_key; 794 + new_ctx->token = subflow_req->token; 795 + new_ctx->ssn_offset = subflow_req->ssn_offset; 796 + new_ctx->idsn = subflow_req->idsn; 797 + } 798 + 799 + static struct tcp_ulp_ops 
subflow_ulp_ops __read_mostly = { 800 + .name = "mptcp", 801 + .owner = THIS_MODULE, 802 + .init = subflow_ulp_init, 803 + .release = subflow_ulp_release, 804 + .clone = subflow_ulp_clone, 805 + }; 806 + 807 + static int subflow_ops_init(struct request_sock_ops *subflow_ops) 808 + { 809 + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); 810 + subflow_ops->slab_name = "request_sock_subflow"; 811 + 812 + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, 813 + subflow_ops->obj_size, 0, 814 + SLAB_ACCOUNT | 815 + SLAB_TYPESAFE_BY_RCU, 816 + NULL); 817 + if (!subflow_ops->slab) 818 + return -ENOMEM; 819 + 820 + subflow_ops->destructor = subflow_req_destructor; 821 + 822 + return 0; 823 + } 824 + 825 + void mptcp_subflow_init(void) 826 + { 827 + subflow_request_sock_ops = tcp_request_sock_ops; 828 + if (subflow_ops_init(&subflow_request_sock_ops) != 0) 829 + panic("MPTCP: failed to init subflow request sock ops\n"); 830 + 831 + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; 832 + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; 833 + 834 + subflow_specific = ipv4_specific; 835 + subflow_specific.conn_request = subflow_v4_conn_request; 836 + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; 837 + subflow_specific.sk_rx_dst_set = subflow_finish_connect; 838 + subflow_specific.rebuild_header = subflow_rebuild_header; 839 + 840 + #if IS_ENABLED(CONFIG_MPTCP_IPV6) 841 + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; 842 + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; 843 + 844 + subflow_v6_specific = ipv6_specific; 845 + subflow_v6_specific.conn_request = subflow_v6_conn_request; 846 + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; 847 + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; 848 + subflow_v6_specific.rebuild_header = subflow_rebuild_header; 849 + 850 + subflow_v6m_specific = subflow_v6_specific; 851 + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; 852 + subflow_v6m_specific.send_check = ipv4_specific.send_check; 853 + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; 854 + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; 855 + subflow_v6m_specific.net_frag_header_len = 0; 856 + #endif 857 + 858 + if (tcp_register_ulp(&subflow_ulp_ops) != 0) 859 + panic("MPTCP: failed to register subflows to ULP\n"); 860 + }
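Much of the receive-path subtlety in subflow.c is in the 32-bit DSS case: when the peer sends 4-byte data sequence numbers, expand_seq() rebuilds the upper word from the end of the previous mapping. A standalone userspace copy of that helper with a quick check of the wrap case (toy values; GENMASK_ULL is redefined locally so the snippet compiles outside the kernel):

	#include <assert.h>
	#include <stdint.h>

	#define GENMASK_ULL(h, l) \
		((~0ULL << (l)) & (~0ULL >> (63 - (h))))

	/* same logic as expand_seq() in subflow.c, userspace types */
	static uint64_t expand_seq(uint64_t old_seq, uint16_t old_data_len,
				   uint64_t seq)
	{
		if ((uint32_t)seq == (uint32_t)old_seq)
			return old_seq;

		/* assume the new map covers data not mapped yet */
		return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
	}

	int main(void)
	{
		/* previous map ended just below a 32-bit boundary ... */
		uint64_t old = 0xffffff00ULL;

		/* ... so a small 32-bit seq is placed in the next 4G window */
		assert(expand_seq(old, 0x200, 0x10) == 0x100000010ULL);

		/* identical lower 32 bits: keep the already-expanded value */
		assert(expand_seq(old, 0x200, 0xffffff00ULL) == old);
		return 0;
	}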
+195
net/mptcp/token.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Multipath TCP token management 3 + * Copyright (c) 2017 - 2019, Intel Corporation. 4 + * 5 + * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, 6 + * authored by: 7 + * 8 + * Sébastien Barré <sebastien.barre@uclouvain.be> 9 + * Christoph Paasch <christoph.paasch@uclouvain.be> 10 + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> 11 + * Gregory Detal <gregory.detal@uclouvain.be> 12 + * Fabien Duchêne <fabien.duchene@uclouvain.be> 13 + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> 14 + * Lavkesh Lahngir <lavkesh51@gmail.com> 15 + * Andreas Ripke <ripke@neclab.eu> 16 + * Vlad Dogaru <vlad.dogaru@intel.com> 17 + * Octavian Purdila <octavian.purdila@intel.com> 18 + * John Ronan <jronan@tssg.org> 19 + * Catalin Nicutar <catalin.nicutar@gmail.com> 20 + * Brandon Heller <brandonh@stanford.edu> 21 + */ 22 + 23 + #define pr_fmt(fmt) "MPTCP: " fmt 24 + 25 + #include <linux/kernel.h> 26 + #include <linux/module.h> 27 + #include <linux/radix-tree.h> 28 + #include <linux/ip.h> 29 + #include <linux/tcp.h> 30 + #include <net/sock.h> 31 + #include <net/inet_common.h> 32 + #include <net/protocol.h> 33 + #include <net/mptcp.h> 34 + #include "protocol.h" 35 + 36 + static RADIX_TREE(token_tree, GFP_ATOMIC); 37 + static RADIX_TREE(token_req_tree, GFP_ATOMIC); 38 + static DEFINE_SPINLOCK(token_tree_lock); 39 + static int token_used __read_mostly; 40 + 41 + /** 42 + * mptcp_token_new_request - create new key/idsn/token for subflow_request 43 + * @req - the request socket 44 + * 45 + * This function is called when a new mptcp connection is coming in. 46 + * 47 + * It creates a unique token to identify the new mptcp connection, 48 + * a secret local key and the initial data sequence number (idsn). 49 + * 50 + * Returns 0 on success. 51 + */ 52 + int mptcp_token_new_request(struct request_sock *req) 53 + { 54 + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 55 + int err; 56 + 57 + while (1) { 58 + u32 token; 59 + 60 + mptcp_crypto_key_gen_sha(&subflow_req->local_key, 61 + &subflow_req->token, 62 + &subflow_req->idsn); 63 + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", 64 + req, subflow_req->local_key, subflow_req->token, 65 + subflow_req->idsn); 66 + 67 + token = subflow_req->token; 68 + spin_lock_bh(&token_tree_lock); 69 + if (!radix_tree_lookup(&token_req_tree, token) && 70 + !radix_tree_lookup(&token_tree, token)) 71 + break; 72 + spin_unlock_bh(&token_tree_lock); 73 + } 74 + 75 + err = radix_tree_insert(&token_req_tree, 76 + subflow_req->token, &token_used); 77 + spin_unlock_bh(&token_tree_lock); 78 + return err; 79 + } 80 + 81 + /** 82 + * mptcp_token_new_connect - create new key/idsn/token for subflow 83 + * @sk - the socket that will initiate a connection 84 + * 85 + * This function is called when a new outgoing mptcp connection is 86 + * initiated. 87 + * 88 + * It creates a unique token to identify the new mptcp connection, 89 + * a secret local key and the initial data sequence number (idsn). 90 + * 91 + * On success, the mptcp connection can be found again using 92 + * the computed token at a later time, this is needed to process 93 + * join requests. 94 + * 95 + * returns 0 on success. 
96 + */ 97 + int mptcp_token_new_connect(struct sock *sk) 98 + { 99 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 100 + struct sock *mptcp_sock = subflow->conn; 101 + int err; 102 + 103 + while (1) { 104 + u32 token; 105 + 106 + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, 107 + &subflow->idsn); 108 + 109 + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", 110 + sk, subflow->local_key, subflow->token, subflow->idsn); 111 + 112 + token = subflow->token; 113 + spin_lock_bh(&token_tree_lock); 114 + if (!radix_tree_lookup(&token_req_tree, token) && 115 + !radix_tree_lookup(&token_tree, token)) 116 + break; 117 + spin_unlock_bh(&token_tree_lock); 118 + } 119 + err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); 120 + spin_unlock_bh(&token_tree_lock); 121 + 122 + return err; 123 + } 124 + 125 + /** 126 + * mptcp_token_new_accept - insert token for later processing 127 + * @token: the token to insert to the tree 128 + * 129 + * Called when a SYN packet creates a new logical connection, i.e. 130 + * is not a join request. 131 + * 132 + * We don't have an mptcp socket yet at that point. 133 + * This is paired with mptcp_token_update_accept, called on accept(). 134 + */ 135 + int mptcp_token_new_accept(u32 token) 136 + { 137 + int err; 138 + 139 + spin_lock_bh(&token_tree_lock); 140 + err = radix_tree_insert(&token_tree, token, &token_used); 141 + spin_unlock_bh(&token_tree_lock); 142 + 143 + return err; 144 + } 145 + 146 + /** 147 + * mptcp_token_update_accept - update token to map to mptcp socket 148 + * @conn: the new struct mptcp_sock 149 + * @sk: the initial subflow for this mptcp socket 150 + * 151 + * Called when the first mptcp socket is created on accept to 152 + * refresh the dummy mapping (done to reserve the token) with 153 + * the mptcp_socket structure that wasn't allocated before. 154 + */ 155 + void mptcp_token_update_accept(struct sock *sk, struct sock *conn) 156 + { 157 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 158 + void __rcu **slot; 159 + 160 + spin_lock_bh(&token_tree_lock); 161 + slot = radix_tree_lookup_slot(&token_tree, subflow->token); 162 + WARN_ON_ONCE(!slot); 163 + if (slot) { 164 + WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used); 165 + radix_tree_replace_slot(&token_tree, slot, conn); 166 + } 167 + spin_unlock_bh(&token_tree_lock); 168 + } 169 + 170 + /** 171 + * mptcp_token_destroy_request - remove mptcp connection/token 172 + * @token - token of mptcp connection to remove 173 + * 174 + * Remove not-yet-fully-established incoming connection identified 175 + * by @token. 176 + */ 177 + void mptcp_token_destroy_request(u32 token) 178 + { 179 + spin_lock_bh(&token_tree_lock); 180 + radix_tree_delete(&token_req_tree, token); 181 + spin_unlock_bh(&token_tree_lock); 182 + } 183 + 184 + /** 185 + * mptcp_token_destroy - remove mptcp connection/token 186 + * @token - token of mptcp connection to remove 187 + * 188 + * Remove the connection identified by @token. 189 + */ 190 + void mptcp_token_destroy(u32 token) 191 + { 192 + spin_lock_bh(&token_tree_lock); 193 + radix_tree_delete(&token_tree, token); 194 + spin_unlock_bh(&token_tree_lock); 195 + }
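A pattern worth calling out in the two allocation paths above: the token is generated, checked against both trees and inserted all under a single hold of token_tree_lock, so a concurrent allocator cannot take the same value between check and insert (note the loop breaks with the lock still held). A userspace sketch of that discipline, with a toy array and a pthread mutex standing in for the radix trees and the spinlock:

	#include <pthread.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define SLOTS 1024			/* toy stand-in for the radix trees */

	static void *token_map[SLOTS];
	static pthread_mutex_t token_lock = PTHREAD_MUTEX_INITIALIZER;

	/* generate-until-unique, then insert under the same lock hold the
	 * uniqueness check ran under -- the shape of mptcp_token_new_request()
	 */
	static uint32_t token_reserve(void *conn)
	{
		uint32_t token;

		for (;;) {
			token = (uint32_t)rand();	/* toy key/token generator */

			pthread_mutex_lock(&token_lock);
			if (!token_map[token % SLOTS])
				break;		/* unique: exit with the lock held */
			pthread_mutex_unlock(&token_lock);
		}

		token_map[token % SLOTS] = conn;
		pthread_mutex_unlock(&token_lock);
		return token;
	}

In the kernel version a retry does not just redraw the token: mptcp_crypto_key_gen_sha() regenerates the local key, and token and idsn are derived from it, which is why the whole triple is recomputed on every iteration.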
+1
tools/testing/selftests/Makefile
··· 32 32 TARGETS += mount 33 33 TARGETS += mqueue 34 34 TARGETS += net 35 + TARGETS += net/mptcp 35 36 TARGETS += netfilter 36 37 TARGETS += networking/timestamping 37 38 TARGETS += nsfs
+2
tools/testing/selftests/net/mptcp/.gitignore
··· 1 + mptcp_connect 2 + *.pcap
+13
tools/testing/selftests/net/mptcp/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + 3 + top_srcdir = ../../../../.. 4 + 5 + CFLAGS = -Wall -Wl,--no-as-needed -O2 -g 6 + 7 + TEST_PROGS := mptcp_connect.sh 8 + 9 + TEST_GEN_FILES = mptcp_connect 10 + 11 + EXTRA_CLEAN := *.pcap 12 + 13 + include ../../lib.mk
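With this Makefile the new tests plug into the kselftest framework; together with the TARGETS addition above they can be built and run from the tree root with the usual kselftest invocation, e.g. make -C tools/testing/selftests TARGETS=net/mptcp run_tests (root is needed for the network namespaces the test script creates).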
+4
tools/testing/selftests/net/mptcp/config
··· 1 + CONFIG_MPTCP=y 2 + CONFIG_MPTCP_IPV6=y 3 + CONFIG_VETH=y 4 + CONFIG_NET_SCH_NETEM=m
+832
tools/testing/selftests/net/mptcp/mptcp_connect.c
··· 1 + // SPDX-License-Identifier: GPL-2.0
2 +
3 + #define _GNU_SOURCE
4 +
5 + #include <errno.h>
6 + #include <limits.h>
7 + #include <fcntl.h>
8 + #include <string.h>
9 + #include <stdbool.h>
10 + #include <stdint.h>
11 + #include <stdio.h>
12 + #include <stdlib.h>
13 + #include <strings.h>
14 + #include <unistd.h>
15 +
16 + #include <sys/poll.h>
17 + #include <sys/sendfile.h>
18 + #include <sys/stat.h>
19 + #include <sys/socket.h>
20 + #include <sys/types.h>
21 + #include <sys/mman.h>
22 +
23 + #include <netdb.h>
24 + #include <netinet/in.h>
25 +
26 + #include <linux/tcp.h>
27 +
28 + extern int optind;
29 +
30 + #ifndef IPPROTO_MPTCP
31 + #define IPPROTO_MPTCP 262
32 + #endif
33 + #ifndef TCP_ULP
34 + #define TCP_ULP 31
35 + #endif
36 +
37 + static bool listen_mode;
38 + static int poll_timeout;
39 +
40 + enum cfg_mode {
41 + CFG_MODE_POLL,
42 + CFG_MODE_MMAP,
43 + CFG_MODE_SENDFILE,
44 + };
45 +
46 + static enum cfg_mode cfg_mode = CFG_MODE_POLL;
47 + static const char *cfg_host;
48 + static const char *cfg_port = "12000";
49 + static int cfg_sock_proto = IPPROTO_MPTCP;
50 + static bool tcpulp_audit;
51 + static int pf = AF_INET;
52 + static int cfg_sndbuf;
53 +
54 + static void die_usage(void)
55 + {
56 + fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]"
57 + " [-l] [-t timeout] [-b sndbuf] connect_address\n");
58 + exit(1);
59 + }
60 +
61 + static const char *getxinfo_strerr(int err)
62 + {
63 + if (err == EAI_SYSTEM)
64 + return strerror(errno);
65 +
66 + return gai_strerror(err);
67 + }
68 +
69 + static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
70 + char *host, socklen_t hostlen,
71 + char *serv, socklen_t servlen)
72 + {
73 + int flags = NI_NUMERICHOST | NI_NUMERICSERV;
74 + int err = getnameinfo(addr, addrlen, host, hostlen, serv, servlen,
75 + flags);
76 +
77 + if (err) {
78 + const char *errstr = getxinfo_strerr(err);
79 +
80 + fprintf(stderr, "Fatal: getnameinfo: %s\n", errstr);
81 + exit(1);
82 + }
83 + }
84 +
85 + static void xgetaddrinfo(const char *node, const char *service,
86 + const struct addrinfo *hints,
87 + struct addrinfo **res)
88 + {
89 + int err = getaddrinfo(node, service, hints, res);
90 +
91 + if (err) {
92 + const char *errstr = getxinfo_strerr(err);
93 +
94 + fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
95 + node ? node : "", service ? 
service : "", errstr); 96 + exit(1); 97 + } 98 + } 99 + 100 + static void set_sndbuf(int fd, unsigned int size) 101 + { 102 + int err; 103 + 104 + err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)); 105 + if (err) { 106 + perror("set SO_SNDBUF"); 107 + exit(1); 108 + } 109 + } 110 + 111 + static int sock_listen_mptcp(const char * const listenaddr, 112 + const char * const port) 113 + { 114 + int sock; 115 + struct addrinfo hints = { 116 + .ai_protocol = IPPROTO_TCP, 117 + .ai_socktype = SOCK_STREAM, 118 + .ai_flags = AI_PASSIVE | AI_NUMERICHOST 119 + }; 120 + 121 + hints.ai_family = pf; 122 + 123 + struct addrinfo *a, *addr; 124 + int one = 1; 125 + 126 + xgetaddrinfo(listenaddr, port, &hints, &addr); 127 + hints.ai_family = pf; 128 + 129 + for (a = addr; a; a = a->ai_next) { 130 + sock = socket(a->ai_family, a->ai_socktype, cfg_sock_proto); 131 + if (sock < 0) 132 + continue; 133 + 134 + if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, 135 + sizeof(one))) 136 + perror("setsockopt"); 137 + 138 + if (bind(sock, a->ai_addr, a->ai_addrlen) == 0) 139 + break; /* success */ 140 + 141 + perror("bind"); 142 + close(sock); 143 + sock = -1; 144 + } 145 + 146 + freeaddrinfo(addr); 147 + 148 + if (sock < 0) { 149 + fprintf(stderr, "Could not create listen socket\n"); 150 + return sock; 151 + } 152 + 153 + if (listen(sock, 20)) { 154 + perror("listen"); 155 + close(sock); 156 + return -1; 157 + } 158 + 159 + return sock; 160 + } 161 + 162 + static bool sock_test_tcpulp(const char * const remoteaddr, 163 + const char * const port) 164 + { 165 + struct addrinfo hints = { 166 + .ai_protocol = IPPROTO_TCP, 167 + .ai_socktype = SOCK_STREAM, 168 + }; 169 + struct addrinfo *a, *addr; 170 + int sock = -1, ret = 0; 171 + bool test_pass = false; 172 + 173 + hints.ai_family = AF_INET; 174 + 175 + xgetaddrinfo(remoteaddr, port, &hints, &addr); 176 + for (a = addr; a; a = a->ai_next) { 177 + sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP); 178 + if (sock < 0) { 179 + perror("socket"); 180 + continue; 181 + } 182 + ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp", 183 + sizeof("mptcp")); 184 + if (ret == -1 && errno == EOPNOTSUPP) 185 + test_pass = true; 186 + close(sock); 187 + 188 + if (test_pass) 189 + break; 190 + if (!ret) 191 + fprintf(stderr, 192 + "setsockopt(TCP_ULP) returned 0\n"); 193 + else 194 + perror("setsockopt(TCP_ULP)"); 195 + } 196 + return test_pass; 197 + } 198 + 199 + static int sock_connect_mptcp(const char * const remoteaddr, 200 + const char * const port, int proto) 201 + { 202 + struct addrinfo hints = { 203 + .ai_protocol = IPPROTO_TCP, 204 + .ai_socktype = SOCK_STREAM, 205 + }; 206 + struct addrinfo *a, *addr; 207 + int sock = -1; 208 + 209 + hints.ai_family = pf; 210 + 211 + xgetaddrinfo(remoteaddr, port, &hints, &addr); 212 + for (a = addr; a; a = a->ai_next) { 213 + sock = socket(a->ai_family, a->ai_socktype, proto); 214 + if (sock < 0) { 215 + perror("socket"); 216 + continue; 217 + } 218 + 219 + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) 220 + break; /* success */ 221 + 222 + perror("connect()"); 223 + close(sock); 224 + sock = -1; 225 + } 226 + 227 + freeaddrinfo(addr); 228 + return sock; 229 + } 230 + 231 + static size_t do_rnd_write(const int fd, char *buf, const size_t len) 232 + { 233 + unsigned int do_w; 234 + ssize_t bw; 235 + 236 + do_w = rand() & 0xffff; 237 + if (do_w == 0 || do_w > len) 238 + do_w = len; 239 + 240 + bw = write(fd, buf, do_w); 241 + if (bw < 0) 242 + perror("write"); 243 + 244 + return bw; 245 + } 246 + 247 + 
static size_t do_write(const int fd, char *buf, const size_t len)
248 + {
249 + size_t offset = 0;
250 +
251 + while (offset < len) {
252 + size_t written;
253 + ssize_t bw;
254 +
255 + bw = write(fd, buf + offset, len - offset);
256 + if (bw < 0) {
257 + perror("write");
258 + return 0;
259 + }
260 +
261 + written = (size_t)bw;
262 + offset += written;
263 + }
264 +
265 + return offset;
266 + }
267 +
268 + static ssize_t do_rnd_read(const int fd, char *buf, const size_t len)
269 + {
270 + size_t cap = rand();
271 +
272 + cap &= 0xffff;
273 +
274 + if (cap == 0)
275 + cap = 1;
276 + else if (cap > len)
277 + cap = len;
278 +
279 + return read(fd, buf, cap);
280 + }
281 +
282 + static void set_nonblock(int fd)
283 + {
284 + int flags = fcntl(fd, F_GETFL);
285 +
286 + if (flags == -1)
287 + return;
288 +
289 + fcntl(fd, F_SETFL, flags | O_NONBLOCK);
290 + }
291 +
292 + static int copyfd_io_poll(int infd, int peerfd, int outfd)
293 + {
294 + struct pollfd fds = {
295 + .fd = peerfd,
296 + .events = POLLIN | POLLOUT,
297 + };
298 + ssize_t woff = 0, wlen = 0; /* signed, so a failed read() is visible below */
299 + char wbuf[8192];
300 +
301 + set_nonblock(peerfd);
302 +
303 + for (;;) {
304 + char rbuf[8192];
305 + ssize_t len;
306 +
307 + if (fds.events == 0)
308 + break;
309 +
310 + switch (poll(&fds, 1, poll_timeout)) {
311 + case -1:
312 + if (errno == EINTR)
313 + continue;
314 + perror("poll");
315 + return 1;
316 + case 0:
317 + fprintf(stderr, "%s: poll timed out (events: "
318 + "POLLIN %u, POLLOUT %u)\n", __func__,
319 + fds.events & POLLIN, fds.events & POLLOUT);
320 + return 2;
321 + }
322 +
323 + if (fds.revents & POLLIN) {
324 + len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
325 + if (len == 0) {
326 + /* no more data to receive:
327 + * peer has closed its write side
328 + */
329 + fds.events &= ~POLLIN;
330 +
331 + if ((fds.events & POLLOUT) == 0)
332 + /* and nothing more to send */
333 + break;
334 +
335 + /* Else, still have data to transmit */
336 + } else if (len < 0) {
337 + perror("read");
338 + return 3;
339 + }
340 +
341 + do_write(outfd, rbuf, len);
342 + }
343 +
344 + if (fds.revents & POLLOUT) {
345 + if (wlen <= 0) {
346 + woff = 0;
347 + wlen = read(infd, wbuf, sizeof(wbuf));
348 + }
349 +
350 + if (wlen > 0) {
351 + ssize_t bw;
352 +
353 + bw = do_rnd_write(peerfd, wbuf + woff, wlen);
354 + if (bw < 0)
355 + return 111;
356 +
357 + woff += bw;
358 + wlen -= bw;
359 + } else if (wlen == 0) {
360 + /* We have no more data to send. */
361 + fds.events &= ~POLLOUT;
362 +
363 + if ((fds.events & POLLIN) == 0)
364 + /* ... and peer also closed already */
365 + break;
366 +
367 + /* ... but we still receive.
368 + * Close our write side. 
369 + */ 370 + shutdown(peerfd, SHUT_WR); 371 + } else { 372 + if (errno == EINTR) 373 + continue; 374 + perror("read"); 375 + return 4; 376 + } 377 + } 378 + 379 + if (fds.revents & (POLLERR | POLLNVAL)) { 380 + fprintf(stderr, "Unexpected revents: " 381 + "POLLERR/POLLNVAL(%x)\n", fds.revents); 382 + return 5; 383 + } 384 + } 385 + 386 + close(peerfd); 387 + return 0; 388 + } 389 + 390 + static int do_recvfile(int infd, int outfd) 391 + { 392 + ssize_t r; 393 + 394 + do { 395 + char buf[16384]; 396 + 397 + r = do_rnd_read(infd, buf, sizeof(buf)); 398 + if (r > 0) { 399 + if (write(outfd, buf, r) != r) 400 + break; 401 + } else if (r < 0) { 402 + perror("read"); 403 + } 404 + } while (r > 0); 405 + 406 + return (int)r; 407 + } 408 + 409 + static int do_mmap(int infd, int outfd, unsigned int size) 410 + { 411 + char *inbuf = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0); 412 + ssize_t ret = 0, off = 0; 413 + size_t rem; 414 + 415 + if (inbuf == MAP_FAILED) { 416 + perror("mmap"); 417 + return 1; 418 + } 419 + 420 + rem = size; 421 + 422 + while (rem > 0) { 423 + ret = write(outfd, inbuf + off, rem); 424 + 425 + if (ret < 0) { 426 + perror("write"); 427 + break; 428 + } 429 + 430 + off += ret; 431 + rem -= ret; 432 + } 433 + 434 + munmap(inbuf, size); 435 + return rem; 436 + } 437 + 438 + static int get_infd_size(int fd) 439 + { 440 + struct stat sb; 441 + ssize_t count; 442 + int err; 443 + 444 + err = fstat(fd, &sb); 445 + if (err < 0) { 446 + perror("fstat"); 447 + return -1; 448 + } 449 + 450 + if ((sb.st_mode & S_IFMT) != S_IFREG) { 451 + fprintf(stderr, "%s: stdin is not a regular file\n", __func__); 452 + return -2; 453 + } 454 + 455 + count = sb.st_size; 456 + if (count > INT_MAX) { 457 + fprintf(stderr, "File too large: %zu\n", count); 458 + return -3; 459 + } 460 + 461 + return (int)count; 462 + } 463 + 464 + static int do_sendfile(int infd, int outfd, unsigned int count) 465 + { 466 + while (count > 0) { 467 + ssize_t r; 468 + 469 + r = sendfile(outfd, infd, NULL, count); 470 + if (r < 0) { 471 + perror("sendfile"); 472 + return 3; 473 + } 474 + 475 + count -= r; 476 + } 477 + 478 + return 0; 479 + } 480 + 481 + static int copyfd_io_mmap(int infd, int peerfd, int outfd, 482 + unsigned int size) 483 + { 484 + int err; 485 + 486 + if (listen_mode) { 487 + err = do_recvfile(peerfd, outfd); 488 + if (err) 489 + return err; 490 + 491 + err = do_mmap(infd, peerfd, size); 492 + } else { 493 + err = do_mmap(infd, peerfd, size); 494 + if (err) 495 + return err; 496 + 497 + shutdown(peerfd, SHUT_WR); 498 + 499 + err = do_recvfile(peerfd, outfd); 500 + } 501 + 502 + return err; 503 + } 504 + 505 + static int copyfd_io_sendfile(int infd, int peerfd, int outfd, 506 + unsigned int size) 507 + { 508 + int err; 509 + 510 + if (listen_mode) { 511 + err = do_recvfile(peerfd, outfd); 512 + if (err) 513 + return err; 514 + 515 + err = do_sendfile(infd, peerfd, size); 516 + } else { 517 + err = do_sendfile(infd, peerfd, size); 518 + if (err) 519 + return err; 520 + err = do_recvfile(peerfd, outfd); 521 + } 522 + 523 + return err; 524 + } 525 + 526 + static int copyfd_io(int infd, int peerfd, int outfd) 527 + { 528 + int file_size; 529 + 530 + switch (cfg_mode) { 531 + case CFG_MODE_POLL: 532 + return copyfd_io_poll(infd, peerfd, outfd); 533 + case CFG_MODE_MMAP: 534 + file_size = get_infd_size(infd); 535 + if (file_size < 0) 536 + return file_size; 537 + return copyfd_io_mmap(infd, peerfd, outfd, file_size); 538 + case CFG_MODE_SENDFILE: 539 + file_size = get_infd_size(infd); 540 + if (file_size < 
0)
541 + return file_size;
542 + return copyfd_io_sendfile(infd, peerfd, outfd, file_size);
543 + }
544 +
545 + fprintf(stderr, "Invalid mode %d\n", cfg_mode);
546 +
547 + die_usage();
548 + return 1;
549 + }
550 +
551 + static void check_sockaddr(int pf, struct sockaddr_storage *ss,
552 + socklen_t salen)
553 + {
554 + struct sockaddr_in6 *sin6;
555 + struct sockaddr_in *sin;
556 + socklen_t wanted_size = 0;
557 +
558 + switch (pf) {
559 + case AF_INET:
560 + wanted_size = sizeof(*sin);
561 + sin = (void *)ss;
562 + if (!sin->sin_port)
563 + fprintf(stderr, "accept: something wrong: ip connection from port 0\n");
564 + break;
565 + case AF_INET6:
566 + wanted_size = sizeof(*sin6);
567 + sin6 = (void *)ss;
568 + if (!sin6->sin6_port)
569 + fprintf(stderr, "accept: something wrong: ipv6 connection from port 0\n");
570 + break;
571 + default:
572 + fprintf(stderr, "accept: Unknown pf %d, salen %u\n", pf, salen);
573 + return;
574 + }
575 +
576 + if (salen != wanted_size)
577 + fprintf(stderr, "accept: size mismatch, got %d expected %d\n",
578 + (int)salen, (int)wanted_size);
579 +
580 + if (ss->ss_family != pf)
581 + fprintf(stderr, "accept: pf mismatch, expect %d, ss_family is %d\n",
582 + pf, (int)ss->ss_family);
583 + }
584 +
585 + static void check_getpeername(int fd, struct sockaddr_storage *ss, socklen_t salen)
586 + {
587 + struct sockaddr_storage peerss;
588 + socklen_t peersalen = sizeof(peerss);
589 +
590 + if (getpeername(fd, (struct sockaddr *)&peerss, &peersalen) < 0) {
591 + perror("getpeername");
592 + return;
593 + }
594 +
595 + if (peersalen != salen) {
596 + fprintf(stderr, "%s: %d vs %d\n", __func__, peersalen, salen);
597 + return;
598 + }
599 +
600 + if (memcmp(ss, &peerss, peersalen)) {
601 + char a[INET6_ADDRSTRLEN];
602 + char b[INET6_ADDRSTRLEN];
603 + char c[INET6_ADDRSTRLEN];
604 + char d[INET6_ADDRSTRLEN];
605 +
606 + xgetnameinfo((struct sockaddr *)ss, salen,
607 + a, sizeof(a), b, sizeof(b));
608 +
609 + xgetnameinfo((struct sockaddr *)&peerss, peersalen,
610 + c, sizeof(c), d, sizeof(d));
611 +
612 + fprintf(stderr, "%s: memcmp failure: accept %s vs peername %s, %s vs %s salen %d vs %d\n",
613 + __func__, a, c, b, d, peersalen, salen);
614 + }
615 + }
616 +
617 + static void check_getpeername_connect(int fd)
618 + {
619 + struct sockaddr_storage ss;
620 + socklen_t salen = sizeof(ss);
621 + char a[INET6_ADDRSTRLEN];
622 + char b[INET6_ADDRSTRLEN];
623 +
624 + if (getpeername(fd, (struct sockaddr *)&ss, &salen) < 0) {
625 + perror("getpeername");
626 + return;
627 + }
628 +
629 + xgetnameinfo((struct sockaddr *)&ss, salen,
630 + a, sizeof(a), b, sizeof(b));
631 +
632 + if (strcmp(cfg_host, a) || strcmp(cfg_port, b))
633 + fprintf(stderr, "%s: %s vs %s, %s vs %s\n", __func__,
634 + cfg_host, a, cfg_port, b);
635 + }
636 +
637 + int main_loop_s(int listensock)
638 + {
639 + struct sockaddr_storage ss;
640 + struct pollfd polls;
641 + socklen_t salen;
642 + int remotesock;
643 +
644 + polls.fd = listensock;
645 + polls.events = POLLIN;
646 +
647 + switch (poll(&polls, 1, poll_timeout)) {
648 + case -1:
649 + perror("poll");
650 + return 1;
651 + case 0:
652 + fprintf(stderr, "%s: timed out\n", __func__);
653 + close(listensock);
654 + return 2;
655 + }
656 +
657 + salen = sizeof(ss);
658 + remotesock = accept(listensock, (struct sockaddr *)&ss, &salen);
659 + if (remotesock >= 0) {
660 + check_sockaddr(pf, &ss, salen);
661 + check_getpeername(remotesock, &ss, salen);
662 +
663 + return copyfd_io(0, remotesock, 1);
664 + }
665 +
666 + perror("accept");
667 +
668 + return 1; 
669 + }
670 +
671 + static void init_rng(void)
672 + {
673 + int fd = open("/dev/urandom", O_RDONLY);
674 + unsigned int foo = 0;
675 +
676 + if (fd >= 0) {
677 + int ret = read(fd, &foo, sizeof(foo));
678 +
679 + if (ret < 0)
680 + foo = fd; /* fallback seed: the read failed */
681 + close(fd);
682 + }
683 +
684 + srand(foo);
685 + }
686 +
687 + int main_loop(void)
688 + {
689 + int fd;
690 +
691 + /* listener is ready. */
692 + fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto);
693 + if (fd < 0)
694 + return 2;
695 +
696 + check_getpeername_connect(fd);
697 +
698 + if (cfg_sndbuf)
699 + set_sndbuf(fd, cfg_sndbuf);
700 +
701 + return copyfd_io(0, fd, 1);
702 + }
703 +
704 + int parse_proto(const char *proto)
705 + {
706 + if (!strcasecmp(proto, "MPTCP"))
707 + return IPPROTO_MPTCP;
708 + if (!strcasecmp(proto, "TCP"))
709 + return IPPROTO_TCP;
710 +
711 + fprintf(stderr, "Unknown protocol: %s\n", proto);
712 + die_usage();
713 +
714 + /* silence compiler warning */
715 + return 0;
716 + }
717 +
718 + int parse_mode(const char *mode)
719 + {
720 + if (!strcasecmp(mode, "poll"))
721 + return CFG_MODE_POLL;
722 + if (!strcasecmp(mode, "mmap"))
723 + return CFG_MODE_MMAP;
724 + if (!strcasecmp(mode, "sendfile"))
725 + return CFG_MODE_SENDFILE;
726 +
727 + fprintf(stderr, "Unknown test mode: %s\n", mode);
728 + fprintf(stderr, "Supported modes are:\n");
729 + fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n");
730 + fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n");
731 + fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n");
732 +
733 + die_usage();
734 +
735 + /* silence compiler warning */
736 + return 0;
737 + }
738 +
739 + int parse_sndbuf(const char *size)
740 + {
741 + unsigned long s;
742 +
743 + errno = 0;
744 +
745 + s = strtoul(size, NULL, 0);
746 +
747 + if (errno) {
748 + fprintf(stderr, "Invalid sndbuf size %s (%s)\n",
749 + size, strerror(errno));
750 + die_usage();
751 + }
752 +
753 + if (s > INT_MAX) {
754 + fprintf(stderr, "Invalid sndbuf size %s (%s)\n",
755 + size, strerror(ERANGE));
756 + die_usage();
757 + }
758 +
759 + cfg_sndbuf = s;
760 +
761 + return 0;
762 + }
763 +
764 + static void parse_opts(int argc, char **argv)
765 + {
766 + int c;
767 +
768 + while ((c = getopt(argc, argv, "6lp:s:hut:m:b:")) != -1) {
769 + switch (c) {
770 + case 'l':
771 + listen_mode = true;
772 + break;
773 + case 'p':
774 + cfg_port = optarg;
775 + break;
776 + case 's':
777 + cfg_sock_proto = parse_proto(optarg);
778 + break;
779 + case 'h':
780 + die_usage();
781 + break;
782 + case 'u':
783 + tcpulp_audit = true;
784 + break;
785 + case '6':
786 + pf = AF_INET6;
787 + break;
788 + case 't':
789 + poll_timeout = atoi(optarg) * 1000;
790 + if (poll_timeout <= 0)
791 + poll_timeout = -1;
792 + break;
793 + case 'm':
794 + cfg_mode = parse_mode(optarg);
795 + break;
796 + case 'b':
797 + parse_sndbuf(optarg);
798 + break;
799 + }
800 + }
801 +
802 + if (optind + 1 != argc)
803 + die_usage();
804 + cfg_host = argv[optind];
805 +
806 + if (strchr(cfg_host, ':'))
807 + pf = AF_INET6;
808 + }
809 +
810 + int main(int argc, char *argv[])
811 + {
812 + init_rng();
813 +
814 + parse_opts(argc, argv);
815 +
816 + if (tcpulp_audit)
817 + return sock_test_tcpulp(cfg_host, cfg_port) ? 
0 : 1; 818 + 819 + if (listen_mode) { 820 + int fd = sock_listen_mptcp(cfg_host, cfg_port); 821 + 822 + if (fd < 0) 823 + return 1; 824 + 825 + if (cfg_sndbuf) 826 + set_sndbuf(fd, cfg_sndbuf); 827 + 828 + return main_loop_s(fd); 829 + } 830 + 831 + return main_loop(); 832 + }
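Aside from the -u mode, which only probes that setsockopt(TCP_ULP, "mptcp") is rejected on plain TCP sockets, everything MPTCP-specific in the tool above reduces to passing IPPROTO_MPTCP as the third socket() argument. A minimal standalone sketch of that entry point, reusing the fallback define from the top of the file (IPPROTO_MPTCP 262 for uapi headers that predate it); the fall-back-to-TCP on failure is this sketch's own choice, not something the selftest does:

	/* mptcp_socket_demo.c: open an MPTCP socket the way mptcp_connect
	 * does, falling back to plain TCP when the protocol cannot be
	 * created (kernel without CONFIG_MPTCP, or net.mptcp.enabled=0
	 * in this netns).
	 */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	#ifndef IPPROTO_MPTCP
	#define IPPROTO_MPTCP 262
	#endif

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
		int is_mptcp = fd >= 0;

		if (fd < 0)
			fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
		if (fd < 0) {
			perror("socket");
			return 1;
		}

		printf("created %s socket, fd=%d\n", is_mptcp ? "MPTCP" : "TCP", fd);
		close(fd);
		return 0;
	}

From that point on the descriptor is used exactly like a TCP socket: connect()/bind()/listen()/accept() and plain read()/write(), which is why the rest of the tool is protocol-agnostic.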
+595
tools/testing/selftests/net/mptcp/mptcp_connect.sh
··· 1 + #!/bin/bash
2 + # SPDX-License-Identifier: GPL-2.0
3 +
4 + time_start=$(date +%s)
5 +
6 + optstring="b:d:e:l:r:h4cm:"
7 + ret=0
8 + sin=""
9 + sout=""
10 + cin=""
11 + cout=""
12 + ksft_skip=4
13 + capture=false
14 + timeout=30
15 + ipv6=true
16 + ethtool_random_on=true
17 + tc_delay="$((RANDOM%400))"
18 + tc_loss=$((RANDOM%101))
19 + tc_reorder=""
20 + testmode=""
21 + sndbuf=0
22 + options_log=true
23 +
24 + if [ $tc_loss -eq 100 ];then
25 + tc_loss=1%
26 + elif [ $tc_loss -ge 10 ]; then
27 + tc_loss=0.$tc_loss%
28 + elif [ $tc_loss -ge 1 ]; then
29 + tc_loss=0.0$tc_loss%
30 + else
31 + tc_loss=""
32 + fi
33 +
34 + usage() {
35 + echo "Usage: $0 [ options ]"
36 + echo -e "\t-d: tc/netem delay in milliseconds, e.g. \"-d 10\" (default random)"
37 + echo -e "\t-l: tc/netem loss percentage, e.g. \"-l 0.02\" (default random)"
38 + echo -e "\t-r: tc/netem reorder mode, e.g. \"-r 25% 50% gap 5\", use \"-r 0\" to disable reordering (default random)"
39 + echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)"
40 + echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
41 + echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
42 + echo -e "\t-b: set sndbuf value (default: use kernel default)"
43 + echo -e "\t-m: test mode (poll, mmap, sendfile; default: poll)"
44 + }
45 +
46 + while getopts "$optstring" option;do
47 + case "$option" in
48 + "h")
49 + usage $0
50 + exit 0
51 + ;;
52 + "d")
53 + if [ $OPTARG -ge 0 ];then
54 + tc_delay="$OPTARG"
55 + else
56 + echo "-d requires numeric argument, got \"$OPTARG\"" 1>&2
57 + exit 1
58 + fi
59 + ;;
60 + "e")
61 + ethtool_args="$ethtool_args $OPTARG off"
62 + ethtool_random_on=false
63 + ;;
64 + "l")
65 + tc_loss="$OPTARG"
66 + ;;
67 + "r")
68 + tc_reorder="$OPTARG"
69 + ;;
70 + "4")
71 + ipv6=false
72 + ;;
73 + "c")
74 + capture=true
75 + ;;
76 + "b")
77 + if [ $OPTARG -ge 0 ];then
78 + sndbuf="$OPTARG"
79 + else
80 + echo "-b requires numeric argument, got \"$OPTARG\"" 1>&2
81 + exit 1
82 + fi
83 + ;;
84 + "m")
85 + testmode="$OPTARG"
86 + ;;
87 + "?")
88 + usage $0
89 + exit 1
90 + ;;
91 + esac
92 + done
93 +
94 + sec=$(date +%s)
95 + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
96 + ns1="ns1-$rndh"
97 + ns2="ns2-$rndh"
98 + ns3="ns3-$rndh"
99 + ns4="ns4-$rndh"
100 +
101 + TEST_COUNT=0
102 +
103 + cleanup()
104 + {
105 + rm -f "$cin" "$cout"
106 + rm -f "$sin" "$sout"
107 + rm -f "$capout"
108 +
109 + local netns
110 + for netns in "$ns1" "$ns2" "$ns3" "$ns4";do
111 + ip netns del $netns
112 + done
113 + }
114 +
115 + ip -Version > /dev/null 2>&1
116 + if [ $? 
-ne 0 ];then 117 + echo "SKIP: Could not run test without ip tool" 118 + exit $ksft_skip 119 + fi 120 + 121 + sin=$(mktemp) 122 + sout=$(mktemp) 123 + cin=$(mktemp) 124 + cout=$(mktemp) 125 + capout=$(mktemp) 126 + trap cleanup EXIT 127 + 128 + for i in "$ns1" "$ns2" "$ns3" "$ns4";do 129 + ip netns add $i || exit $ksft_skip 130 + ip -net $i link set lo up 131 + done 132 + 133 + # "$ns1" ns2 ns3 ns4 134 + # ns1eth2 ns2eth1 ns2eth3 ns3eth2 ns3eth4 ns4eth3 135 + # - drop 1% -> reorder 25% 136 + # <- TSO off - 137 + 138 + ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth1 netns "$ns2" 139 + ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth2 netns "$ns3" 140 + ip link add ns3eth4 netns "$ns3" type veth peer name ns4eth3 netns "$ns4" 141 + 142 + ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth2 143 + ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth2 nodad 144 + 145 + ip -net "$ns1" link set ns1eth2 up 146 + ip -net "$ns1" route add default via 10.0.1.2 147 + ip -net "$ns1" route add default via dead:beef:1::2 148 + 149 + ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 150 + ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad 151 + ip -net "$ns2" link set ns2eth1 up 152 + 153 + ip -net "$ns2" addr add 10.0.2.1/24 dev ns2eth3 154 + ip -net "$ns2" addr add dead:beef:2::1/64 dev ns2eth3 nodad 155 + ip -net "$ns2" link set ns2eth3 up 156 + ip -net "$ns2" route add default via 10.0.2.2 157 + ip -net "$ns2" route add default via dead:beef:2::2 158 + ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1 159 + ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1 160 + 161 + ip -net "$ns3" addr add 10.0.2.2/24 dev ns3eth2 162 + ip -net "$ns3" addr add dead:beef:2::2/64 dev ns3eth2 nodad 163 + ip -net "$ns3" link set ns3eth2 up 164 + 165 + ip -net "$ns3" addr add 10.0.3.2/24 dev ns3eth4 166 + ip -net "$ns3" addr add dead:beef:3::2/64 dev ns3eth4 nodad 167 + ip -net "$ns3" link set ns3eth4 up 168 + ip -net "$ns3" route add default via 10.0.2.1 169 + ip -net "$ns3" route add default via dead:beef:2::1 170 + ip netns exec "$ns3" sysctl -q net.ipv4.ip_forward=1 171 + ip netns exec "$ns3" sysctl -q net.ipv6.conf.all.forwarding=1 172 + 173 + ip -net "$ns4" addr add 10.0.3.1/24 dev ns4eth3 174 + ip -net "$ns4" addr add dead:beef:3::1/64 dev ns4eth3 nodad 175 + ip -net "$ns4" link set ns4eth3 up 176 + ip -net "$ns4" route add default via 10.0.3.2 177 + ip -net "$ns4" route add default via dead:beef:3::2 178 + 179 + set_ethtool_flags() { 180 + local ns="$1" 181 + local dev="$2" 182 + local flags="$3" 183 + 184 + ip netns exec $ns ethtool -K $dev $flags 2>/dev/null 185 + [ $? 
-eq 0 ] && echo "INFO: set $ns dev $dev: ethtool -K $flags"
186 + }
187 +
188 + set_random_ethtool_flags() {
189 + local flags=""
190 + local r=$RANDOM
191 +
192 + local pick1=$((r & 1))
193 + local pick2=$((r & 2))
194 + local pick3=$((r & 4))
195 +
196 + [ $pick1 -ne 0 ] && flags="tso off"
197 + [ $pick2 -ne 0 ] && flags="$flags gso off"
198 + [ $pick3 -ne 0 ] && flags="$flags gro off"
199 +
200 + [ -z "$flags" ] && return
201 +
202 + set_ethtool_flags "$1" "$2" "$flags"
203 + }
204 +
205 + if $ethtool_random_on;then
206 + set_random_ethtool_flags "$ns3" ns3eth2
207 + set_random_ethtool_flags "$ns4" ns4eth3
208 + else
209 + set_ethtool_flags "$ns3" ns3eth2 "$ethtool_args"
210 + set_ethtool_flags "$ns4" ns4eth3 "$ethtool_args"
211 + fi
212 +
213 + print_file_err()
214 + {
215 + ls -l "$1" 1>&2
216 + echo "Trailing bytes are: "
217 + tail -c 27 "$1"
218 + }
219 +
220 + check_transfer()
221 + {
222 + local in=$1
223 + local out=$2
224 + local what=$3
225 +
226 + cmp "$in" "$out" > /dev/null 2>&1
227 + if [ $? -ne 0 ] ;then
228 + echo "[ FAIL ] $what does not match (in, out):"
229 + print_file_err "$in"
230 + print_file_err "$out"
231 +
232 + return 1
233 + fi
234 +
235 + return 0
236 + }
237 +
238 + check_mptcp_disabled()
239 + {
240 + local disabled_ns
241 + disabled_ns="ns_disabled-$rndh-$(mktemp -u XXXXXX)"
242 + ip netns add ${disabled_ns} || exit $ksft_skip
243 +
244 + # net.mptcp.enabled should be enabled by default
245 + if [ "$(ip netns exec ${disabled_ns} sysctl net.mptcp.enabled | awk '{ print $3 }')" -ne 1 ]; then
246 + echo -e "net.mptcp.enabled sysctl is not 1 by default\t\t[ FAIL ]"
247 + ret=1
248 + return 1
249 + fi
250 + ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0
251 +
252 + local err=0
253 + LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
254 + grep -q "^socket: Protocol not available$" && err=1
255 + ip netns delete ${disabled_ns}
256 +
257 + if [ ${err} -eq 0 ]; then
258 + echo -e "New MPTCP socket cannot be blocked via sysctl\t\t[ FAIL ]"
259 + ret=1
260 + return 1
261 + fi
262 +
263 + echo -e "New MPTCP socket can be blocked via sysctl\t\t[ OK ]"
264 + return 0
265 + }
266 +
267 + check_mptcp_ulp_setsockopt()
268 + {
269 + local t retval
270 + t="ns_ulp-$rndh-$(mktemp -u XXXXXX)"
271 +
272 + ip netns add ${t} || exit $ksft_skip
273 + if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
274 + printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
275 + retval=1
276 + ret=$retval
277 + else
278 + printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
279 + retval=0
280 + fi
281 + ip netns del ${t}
282 + return $retval
283 + }
284 +
285 + # $1: IP address
286 + is_v6()
287 + {
288 + [ -z "${1##*:*}" ]
289 + }
290 +
291 + do_ping()
292 + {
293 + local listener_ns="$1"
294 + local connector_ns="$2"
295 + local connect_addr="$3"
296 + local ping_args="-q -c 1"
297 +
298 + if is_v6 "${connect_addr}"; then
299 + $ipv6 || return 0
300 + ping_args="${ping_args} -6"
301 + fi
302 +
303 + ip netns exec ${connector_ns} ping ${ping_args} $connect_addr >/dev/null
304 + if [ $? 
-ne 0 ] ; then
305 + echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
306 + ret=1
307 +
308 + return 1
309 + fi
310 +
311 + return 0
312 + }
313 +
314 + # $1: ns, $2: port
315 + wait_local_port_listen()
316 + {
317 + local listener_ns="${1}"
318 + local port="${2}"
319 +
320 + local port_hex i
321 +
322 + port_hex="$(printf "%04X" "${port}")"
323 + for i in $(seq 10); do
324 + ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
325 + awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
326 + break
327 + sleep 0.1
328 + done
329 + }
330 +
331 + do_transfer()
332 + {
333 + local listener_ns="$1"
334 + local connector_ns="$2"
335 + local cl_proto="$3"
336 + local srv_proto="$4"
337 + local connect_addr="$5"
338 + local local_addr="$6"
339 + local extra_args=""
340 +
341 + local port
342 + port=$((10000+$TEST_COUNT))
343 + TEST_COUNT=$((TEST_COUNT+1))
344 +
345 + if [ "$sndbuf" -gt 0 ]; then
346 + extra_args="$extra_args -b $sndbuf"
347 + fi
348 +
349 + if [ -n "$testmode" ]; then
350 + extra_args="$extra_args -m $testmode"
351 + fi
352 +
353 + if [ -n "$extra_args" ] && $options_log; then
354 + options_log=false
355 + echo "INFO: extra options: $extra_args"
356 + fi
357 +
358 + :> "$cout"
359 + :> "$sout"
360 + :> "$capout"
361 +
362 + local addr_port
363 + addr_port=$(printf "%s:%d" ${connect_addr} ${port})
364 + printf "%.3s %-5s -> %.3s (%-20s) %-5s\t" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto}
365 +
366 + if $capture; then
367 + local capuser
368 + if [ -z $SUDO_USER ] ; then
369 + capuser=""
370 + else
371 + capuser="-Z $SUDO_USER"
372 + fi
373 +
374 + local capfile="${listener_ns}-${connector_ns}-${cl_proto}-${srv_proto}-${connect_addr}.pcap"
375 +
376 + ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
377 + local cappid=$!
378 +
379 + sleep 1
380 + fi
381 +
382 + ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" &
383 + local spid=$!
384 +
385 + wait_local_port_listen "${listener_ns}" "${port}"
386 +
387 + local start
388 + start=$(date +%s%3N)
389 + ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" &
390 + local cpid=$!
391 +
392 + wait $cpid
393 + local retc=$?
394 + wait $spid
395 + local rets=$?
396 +
397 + local stop
398 + stop=$(date +%s%3N)
399 +
400 + if $capture; then
401 + sleep 1
402 + kill $cappid
403 + fi
404 +
405 + local duration
406 + duration=$((stop-start))
407 + duration=$(printf "(duration %05dms)" $duration)
408 + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
409 + echo "$duration [ FAIL ] client exit code $retc, server $rets" 1>&2
410 + echo -e "\nnetns ${listener_ns} socket stat for $port:" 1>&2
411 + ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
412 + echo -e "\nnetns ${connector_ns} socket stat for $port:" 1>&2
413 + ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
414 +
415 + cat "$capout"
416 + return 1
417 + fi
418 +
419 + check_transfer $sin $cout "file received by client"
420 + retc=$?
421 + check_transfer $cin $sout "file received by server"
422 + rets=$? 
423 + 424 + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then 425 + echo "$duration [ OK ]" 426 + cat "$capout" 427 + return 0 428 + fi 429 + 430 + cat "$capout" 431 + return 1 432 + } 433 + 434 + make_file() 435 + { 436 + local name=$1 437 + local who=$2 438 + 439 + local SIZE TSIZE 440 + SIZE=$((RANDOM % (1024 * 8))) 441 + TSIZE=$((SIZE * 1024)) 442 + 443 + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null 444 + 445 + SIZE=$((RANDOM % 1024)) 446 + SIZE=$((SIZE + 128)) 447 + TSIZE=$((TSIZE + SIZE)) 448 + dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null 449 + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" 450 + 451 + echo "Created $name (size $TSIZE) containing data sent by $who" 452 + } 453 + 454 + run_tests_lo() 455 + { 456 + local listener_ns="$1" 457 + local connector_ns="$2" 458 + local connect_addr="$3" 459 + local loopback="$4" 460 + local lret=0 461 + 462 + # skip if test programs are running inside same netns for subsequent runs. 463 + if [ $loopback -eq 0 ] && [ ${listener_ns} = ${connector_ns} ]; then 464 + return 0 465 + fi 466 + 467 + # skip if we don't want v6 468 + if ! $ipv6 && is_v6 "${connect_addr}"; then 469 + return 0 470 + fi 471 + 472 + local local_addr 473 + if is_v6 "${connect_addr}"; then 474 + local_addr="::" 475 + else 476 + local_addr="0.0.0.0" 477 + fi 478 + 479 + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr} 480 + lret=$? 481 + if [ $lret -ne 0 ]; then 482 + ret=$lret 483 + return 1 484 + fi 485 + 486 + # don't bother testing fallback tcp except for loopback case. 487 + if [ ${listener_ns} != ${connector_ns} ]; then 488 + return 0 489 + fi 490 + 491 + do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr} 492 + lret=$? 493 + if [ $lret -ne 0 ]; then 494 + ret=$lret 495 + return 1 496 + fi 497 + 498 + do_transfer ${listener_ns} ${connector_ns} TCP MPTCP ${connect_addr} ${local_addr} 499 + lret=$? 
500 + if [ $lret -ne 0 ]; then
501 + ret=$lret
502 + return 1
503 + fi
504 +
505 + return 0
506 + }
507 +
508 + run_tests()
509 + {
510 + run_tests_lo $1 $2 $3 0
511 + }
512 +
513 + make_file "$cin" "client"
514 + make_file "$sin" "server"
515 +
516 + check_mptcp_disabled
517 +
518 + check_mptcp_ulp_setsockopt
519 +
520 + echo "INFO: validating network environment with pings"
521 + for sender in "$ns1" "$ns2" "$ns3" "$ns4";do
522 + do_ping "$ns1" $sender 10.0.1.1
523 + do_ping "$ns1" $sender dead:beef:1::1
524 +
525 + do_ping "$ns2" $sender 10.0.1.2
526 + do_ping "$ns2" $sender dead:beef:1::2
527 + do_ping "$ns2" $sender 10.0.2.1
528 + do_ping "$ns2" $sender dead:beef:2::1
529 +
530 + do_ping "$ns3" $sender 10.0.2.2
531 + do_ping "$ns3" $sender dead:beef:2::2
532 + do_ping "$ns3" $sender 10.0.3.2
533 + do_ping "$ns3" $sender dead:beef:3::2
534 +
535 + do_ping "$ns4" $sender 10.0.3.1
536 + do_ping "$ns4" $sender dead:beef:3::1
537 + done
538 +
539 + [ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss
540 + echo -n "INFO: Using loss of $tc_loss "
541 + test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms "
542 +
543 + if [ -z "${tc_reorder}" ]; then
544 + reorder1=$((RANDOM%10))
545 + reorder1=$((100 - reorder1))
546 + reorder2=$((RANDOM%100))
547 +
548 + if [ $tc_delay -gt 0 ] && [ $reorder1 -lt 100 ] && [ $reorder2 -gt 0 ]; then
549 + tc_reorder="reorder ${reorder1}% ${reorder2}%"
550 + echo -n "$tc_reorder "
551 + fi
552 + elif [ "$tc_reorder" = "0" ];then
553 + tc_reorder=""
554 + elif [ "$tc_delay" -gt 0 ];then
555 + # reordering requires some delay
556 + tc_reorder="reorder $tc_reorder"
557 + echo -n "$tc_reorder "
558 + fi
559 +
560 + echo "on ns3eth4"
561 +
562 + tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${tc_delay}ms $tc_reorder
563 +
564 + for sender in $ns1 $ns2 $ns3 $ns4;do
565 + run_tests_lo "$ns1" "$sender" 10.0.1.1 1
566 + if [ $ret -ne 0 ] ;then
567 + echo "FAIL: Could not even run loopback test" 1>&2
568 + exit $ret
569 + fi
570 + run_tests_lo "$ns1" $sender dead:beef:1::1 1
571 + if [ $ret -ne 0 ] ;then
572 + echo "FAIL: Could not even run loopback v6 test" 1>&2
573 + exit $ret
574 + fi
575 +
576 + run_tests "$ns2" $sender 10.0.1.2
577 + run_tests "$ns2" $sender dead:beef:1::2
578 + run_tests "$ns2" $sender 10.0.2.1
579 + run_tests "$ns2" $sender dead:beef:2::1
580 +
581 + run_tests "$ns3" $sender 10.0.2.2
582 + run_tests "$ns3" $sender dead:beef:2::2
583 + run_tests "$ns3" $sender 10.0.3.2
584 + run_tests "$ns3" $sender dead:beef:3::2
585 +
586 + run_tests "$ns4" $sender 10.0.3.1
587 + run_tests "$ns4" $sender dead:beef:3::1
588 + done
589 +
590 + time_end=$(date +%s)
591 + time_run=$((time_end-time_start))
592 +
593 + echo "Time: ${time_run} seconds"
594 +
595 + exit $ret
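The script must run as root from tools/testing/selftests/net/mptcp. With no arguments it draws random netem delay/loss/reorder values and randomly toggles tso/gso/gro, so repeated runs exercise different code paths; a deterministic run can pin everything instead, e.g. ./mptcp_connect.sh -4 -d 10 -l 0.02 -r 0 -m sendfile for IPv4 only with fixed delay and loss, no reordering, and the sendfile mode. Adding -c writes a .pcap per test (cleaned up by the Makefile's EXTRA_CLEAN rule above).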
+1
tools/testing/selftests/net/mptcp/settings
··· 1 + timeout=450
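The settings file raises the kselftest runner's per-test timeout to 450 seconds: with random loss and delay injected by netem, a full mptcp_connect.sh run across all the namespace pairs can take far longer than the framework's default 45-second limit.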