Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE

Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.

Without this patch:
3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt
|
--3.30%--__cgroup_bpf_run_filter_getsockopt
|
--0.81%--__kmalloc

With the patch applied:
0.52% 0.12% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt_kern

Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have conflicting
definitions.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com

authored by

Stanislav Fomichev and committed by
Alexei Starovoitov
9cacf81f 13ca51d5

+506 -7
+23 -4
include/linux/bpf-cgroup.h
··· 147 147 int __user *optlen, int max_optlen, 148 148 int retval); 149 149 150 + int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, 151 + int optname, void *optval, 152 + int *optlen, int retval); 153 + 150 154 static inline enum bpf_cgroup_storage_type cgroup_storage_type( 151 155 struct bpf_map *map) 152 156 { ··· 368 364 ({ \ 369 365 int __ret = retval; \ 370 366 if (cgroup_bpf_enabled) \ 371 - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ 372 - optname, optval, \ 373 - optlen, max_optlen, \ 374 - retval); \ 367 + if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ 368 + !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ 369 + tcp_bpf_bypass_getsockopt, \ 370 + level, optname)) \ 371 + __ret = __cgroup_bpf_run_filter_getsockopt( \ 372 + sock, level, optname, optval, optlen, \ 373 + max_optlen, retval); \ 374 + __ret; \ 375 + }) 376 + 377 + #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ 378 + optlen, retval) \ 379 + ({ \ 380 + int __ret = retval; \ 381 + if (cgroup_bpf_enabled) \ 382 + __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ 383 + sock, level, optname, optval, optlen, retval); \ 375 384 __ret; \ 376 385 }) 377 386 ··· 469 452 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) 470 453 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ 471 454 optlen, max_optlen, retval) ({ retval; }) 455 + #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ 456 + optlen, retval) ({ retval; }) 472 457 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ 473 458 kernel_optval) ({ 0; }) 474 459
+6
include/linux/indirect_call_wrapper.h
··· 60 60 #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) 61 61 #endif 62 62 63 + #if IS_ENABLED(CONFIG_INET) 64 + #define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) 65 + #else 66 + #define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) 67 + #endif 68 + 63 69 #endif
+2
include/net/sock.h
··· 1174 1174 1175 1175 int (*backlog_rcv) (struct sock *sk, 1176 1176 struct sk_buff *skb); 1177 + bool (*bpf_bypass_getsockopt)(int level, 1178 + int optname); 1177 1179 1178 1180 void (*release_cb)(struct sock *sk); 1179 1181
+1
include/net/tcp.h
··· 403 403 struct poll_table_struct *wait); 404 404 int tcp_getsockopt(struct sock *sk, int level, int optname, 405 405 char __user *optval, int __user *optlen); 406 + bool tcp_bpf_bypass_getsockopt(int level, int optname); 406 407 int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, 407 408 unsigned int optlen); 408 409 void tcp_set_keepalive(struct sock *sk, int val);
+46
kernel/bpf/cgroup.c
··· 1486 1486 sockopt_free_buf(&ctx); 1487 1487 return ret; 1488 1488 } 1489 + 1490 + int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, 1491 + int optname, void *optval, 1492 + int *optlen, int retval) 1493 + { 1494 + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1495 + struct bpf_sockopt_kern ctx = { 1496 + .sk = sk, 1497 + .level = level, 1498 + .optname = optname, 1499 + .retval = retval, 1500 + .optlen = *optlen, 1501 + .optval = optval, 1502 + .optval_end = optval + *optlen, 1503 + }; 1504 + int ret; 1505 + 1506 + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy 1507 + * user data back into BPF buffer when reval != 0. This is 1508 + * done as an optimization to avoid extra copy, assuming 1509 + * kernel won't populate the data in case of an error. 1510 + * Here we always pass the data and memset() should 1511 + * be called if that data shouldn't be "exported". 1512 + */ 1513 + 1514 + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], 1515 + &ctx, BPF_PROG_RUN); 1516 + if (!ret) 1517 + return -EPERM; 1518 + 1519 + if (ctx.optlen > *optlen) 1520 + return -EFAULT; 1521 + 1522 + /* BPF programs only allowed to set retval to 0, not some 1523 + * arbitrary value. 1524 + */ 1525 + if (ctx.retval != 0 && ctx.retval != retval) 1526 + return -EFAULT; 1527 + 1528 + /* BPF programs can shrink the buffer, export the modifications. 1529 + */ 1530 + if (ctx.optlen != 0) 1531 + *optlen = ctx.optlen; 1532 + 1533 + return ctx.retval; 1534 + } 1489 1535 #endif 1490 1536 1491 1537 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+14
net/ipv4/tcp.c
··· 4099 4099 return -EFAULT; 4100 4100 lock_sock(sk); 4101 4101 err = tcp_zerocopy_receive(sk, &zc); 4102 + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, 4103 + &zc, &len, err); 4102 4104 release_sock(sk); 4103 4105 if (len >= offsetofend(struct tcp_zerocopy_receive, err)) 4104 4106 goto zerocopy_rcv_sk_err; ··· 4134 4132 return -EFAULT; 4135 4133 return 0; 4136 4134 } 4135 + 4136 + bool tcp_bpf_bypass_getsockopt(int level, int optname) 4137 + { 4138 + /* TCP do_tcp_getsockopt has optimized getsockopt implementation 4139 + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. 4140 + */ 4141 + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) 4142 + return true; 4143 + 4144 + return false; 4145 + } 4146 + EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); 4137 4147 4138 4148 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, 4139 4149 int __user *optlen)
+1
net/ipv4/tcp_ipv4.c
··· 2793 2793 .shutdown = tcp_shutdown, 2794 2794 .setsockopt = tcp_setsockopt, 2795 2795 .getsockopt = tcp_getsockopt, 2796 + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 2796 2797 .keepalive = tcp_set_keepalive, 2797 2798 .recvmsg = tcp_recvmsg, 2798 2799 .sendmsg = tcp_sendmsg,
+1
net/ipv6/tcp_ipv6.c
··· 2121 2121 .shutdown = tcp_shutdown, 2122 2122 .setsockopt = tcp_setsockopt, 2123 2123 .getsockopt = tcp_getsockopt, 2124 + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 2124 2125 .keepalive = tcp_set_keepalive, 2125 2126 .recvmsg = tcp_recvmsg, 2126 2127 .sendmsg = tcp_sendmsg,
+3
net/socket.c
··· 2126 2126 return __sys_setsockopt(fd, level, optname, optval, optlen); 2127 2127 } 2128 2128 2129 + INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, 2130 + int optname)); 2131 + 2129 2132 /* 2130 2133 * Get a socket option. Because we don't know the option lengths we have 2131 2134 * to pass a user mode parameter for the protocols to sort out.
+357
tools/include/uapi/linux/tcp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ 2 + /* 3 + * INET An implementation of the TCP/IP protocol suite for the LINUX 4 + * operating system. INET is implemented using the BSD Socket 5 + * interface as the means of communication with the user level. 6 + * 7 + * Definitions for the TCP protocol. 8 + * 9 + * Version: @(#)tcp.h 1.0.2 04/28/93 10 + * 11 + * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 + * 13 + * This program is free software; you can redistribute it and/or 14 + * modify it under the terms of the GNU General Public License 15 + * as published by the Free Software Foundation; either version 16 + * 2 of the License, or (at your option) any later version. 17 + */ 18 + #ifndef _UAPI_LINUX_TCP_H 19 + #define _UAPI_LINUX_TCP_H 20 + 21 + #include <linux/types.h> 22 + #include <asm/byteorder.h> 23 + #include <linux/socket.h> 24 + 25 + struct tcphdr { 26 + __be16 source; 27 + __be16 dest; 28 + __be32 seq; 29 + __be32 ack_seq; 30 + #if defined(__LITTLE_ENDIAN_BITFIELD) 31 + __u16 res1:4, 32 + doff:4, 33 + fin:1, 34 + syn:1, 35 + rst:1, 36 + psh:1, 37 + ack:1, 38 + urg:1, 39 + ece:1, 40 + cwr:1; 41 + #elif defined(__BIG_ENDIAN_BITFIELD) 42 + __u16 doff:4, 43 + res1:4, 44 + cwr:1, 45 + ece:1, 46 + urg:1, 47 + ack:1, 48 + psh:1, 49 + rst:1, 50 + syn:1, 51 + fin:1; 52 + #else 53 + #error "Adjust your <asm/byteorder.h> defines" 54 + #endif 55 + __be16 window; 56 + __sum16 check; 57 + __be16 urg_ptr; 58 + }; 59 + 60 + /* 61 + * The union cast uses a gcc extension to avoid aliasing problems 62 + * (union is compatible to any of its members) 63 + * This means this part of the code is -fstrict-aliasing safe now. 
64 + */ 65 + union tcp_word_hdr { 66 + struct tcphdr hdr; 67 + __be32 words[5]; 68 + }; 69 + 70 + #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) 71 + 72 + enum { 73 + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), 74 + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), 75 + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), 76 + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), 77 + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), 78 + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), 79 + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), 80 + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), 81 + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), 82 + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) 83 + }; 84 + 85 + /* 86 + * TCP general constants 87 + */ 88 + #define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */ 89 + #define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */ 90 + 91 + /* TCP socket options */ 92 + #define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ 93 + #define TCP_MAXSEG 2 /* Limit MSS */ 94 + #define TCP_CORK 3 /* Never send partially complete segments */ 95 + #define TCP_KEEPIDLE 4 /* Start keeplives after this period */ 96 + #define TCP_KEEPINTVL 5 /* Interval between keepalives */ 97 + #define TCP_KEEPCNT 6 /* Number of keepalives before death */ 98 + #define TCP_SYNCNT 7 /* Number of SYN retransmits */ 99 + #define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ 100 + #define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ 101 + #define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ 102 + #define TCP_INFO 11 /* Information about this connection. 
*/ 103 + #define TCP_QUICKACK 12 /* Block/reenable quick acks */ 104 + #define TCP_CONGESTION 13 /* Congestion control algorithm */ 105 + #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ 106 + #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ 107 + #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ 108 + #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ 109 + #define TCP_REPAIR 19 /* TCP sock is under repair right now */ 110 + #define TCP_REPAIR_QUEUE 20 111 + #define TCP_QUEUE_SEQ 21 112 + #define TCP_REPAIR_OPTIONS 22 113 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ 114 + #define TCP_TIMESTAMP 24 115 + #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ 116 + #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ 117 + #define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ 118 + #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ 119 + #define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ 120 + #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ 121 + #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ 122 + #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ 123 + #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ 124 + #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ 125 + #define TCP_ZEROCOPY_RECEIVE 35 126 + #define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ 127 + 128 + #define TCP_CM_INQ TCP_INQ 129 + 130 + #define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ 131 + 132 + 133 + #define TCP_REPAIR_ON 1 134 + #define TCP_REPAIR_OFF 0 135 + #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ 136 + 137 + struct tcp_repair_opt { 138 + __u32 opt_code; 139 + __u32 opt_val; 140 + }; 141 + 142 + struct tcp_repair_window { 143 + __u32 snd_wl1; 144 + __u32 snd_wnd; 145 + __u32 
max_window; 146 + 147 + __u32 rcv_wnd; 148 + __u32 rcv_wup; 149 + }; 150 + 151 + enum { 152 + TCP_NO_QUEUE, 153 + TCP_RECV_QUEUE, 154 + TCP_SEND_QUEUE, 155 + TCP_QUEUES_NR, 156 + }; 157 + 158 + /* why fastopen failed from client perspective */ 159 + enum tcp_fastopen_client_fail { 160 + TFO_STATUS_UNSPEC, /* catch-all */ 161 + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ 162 + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ 163 + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ 164 + }; 165 + 166 + /* for TCP_INFO socket option */ 167 + #define TCPI_OPT_TIMESTAMPS 1 168 + #define TCPI_OPT_SACK 2 169 + #define TCPI_OPT_WSCALE 4 170 + #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ 171 + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ 172 + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ 173 + 174 + /* 175 + * Sender's congestion state indicating normal or abnormal situations 176 + * in the last round of packets sent. The state is driven by the ACK 177 + * information and timer events. 178 + */ 179 + enum tcp_ca_state { 180 + /* 181 + * Nothing bad has been observed recently. 182 + * No apparent reordering, packet loss, or ECN marks. 183 + */ 184 + TCP_CA_Open = 0, 185 + #define TCPF_CA_Open (1<<TCP_CA_Open) 186 + /* 187 + * The sender enters disordered state when it has received DUPACKs or 188 + * SACKs in the last round of packets sent. This could be due to packet 189 + * loss or reordering but needs further information to confirm packets 190 + * have been lost. 191 + */ 192 + TCP_CA_Disorder = 1, 193 + #define TCPF_CA_Disorder (1<<TCP_CA_Disorder) 194 + /* 195 + * The sender enters Congestion Window Reduction (CWR) state when it 196 + * has received ACKs with ECN-ECE marks, or has experienced congestion 197 + * or packet discard on the sender host (e.g. qdisc). 
198 + */ 199 + TCP_CA_CWR = 2, 200 + #define TCPF_CA_CWR (1<<TCP_CA_CWR) 201 + /* 202 + * The sender is in fast recovery and retransmitting lost packets, 203 + * typically triggered by ACK events. 204 + */ 205 + TCP_CA_Recovery = 3, 206 + #define TCPF_CA_Recovery (1<<TCP_CA_Recovery) 207 + /* 208 + * The sender is in loss recovery triggered by retransmission timeout. 209 + */ 210 + TCP_CA_Loss = 4 211 + #define TCPF_CA_Loss (1<<TCP_CA_Loss) 212 + }; 213 + 214 + struct tcp_info { 215 + __u8 tcpi_state; 216 + __u8 tcpi_ca_state; 217 + __u8 tcpi_retransmits; 218 + __u8 tcpi_probes; 219 + __u8 tcpi_backoff; 220 + __u8 tcpi_options; 221 + __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; 222 + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; 223 + 224 + __u32 tcpi_rto; 225 + __u32 tcpi_ato; 226 + __u32 tcpi_snd_mss; 227 + __u32 tcpi_rcv_mss; 228 + 229 + __u32 tcpi_unacked; 230 + __u32 tcpi_sacked; 231 + __u32 tcpi_lost; 232 + __u32 tcpi_retrans; 233 + __u32 tcpi_fackets; 234 + 235 + /* Times. */ 236 + __u32 tcpi_last_data_sent; 237 + __u32 tcpi_last_ack_sent; /* Not remembered, sorry. */ 238 + __u32 tcpi_last_data_recv; 239 + __u32 tcpi_last_ack_recv; 240 + 241 + /* Metrics. 
*/ 242 + __u32 tcpi_pmtu; 243 + __u32 tcpi_rcv_ssthresh; 244 + __u32 tcpi_rtt; 245 + __u32 tcpi_rttvar; 246 + __u32 tcpi_snd_ssthresh; 247 + __u32 tcpi_snd_cwnd; 248 + __u32 tcpi_advmss; 249 + __u32 tcpi_reordering; 250 + 251 + __u32 tcpi_rcv_rtt; 252 + __u32 tcpi_rcv_space; 253 + 254 + __u32 tcpi_total_retrans; 255 + 256 + __u64 tcpi_pacing_rate; 257 + __u64 tcpi_max_pacing_rate; 258 + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ 259 + __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ 260 + __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ 261 + __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ 262 + 263 + __u32 tcpi_notsent_bytes; 264 + __u32 tcpi_min_rtt; 265 + __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ 266 + __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ 267 + 268 + __u64 tcpi_delivery_rate; 269 + 270 + __u64 tcpi_busy_time; /* Time (usec) busy sending data */ 271 + __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ 272 + __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ 273 + 274 + __u32 tcpi_delivered; 275 + __u32 tcpi_delivered_ce; 276 + 277 + __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ 278 + __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ 279 + __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ 280 + __u32 tcpi_reord_seen; /* reordering events seen */ 281 + 282 + __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ 283 + 284 + __u32 tcpi_snd_wnd; /* peer's advertised receive window after 285 + * scaling (bytes) 286 + */ 287 + }; 288 + 289 + /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ 290 + enum { 291 + TCP_NLA_PAD, 292 + TCP_NLA_BUSY, /* Time (usec) busy sending data */ 293 + TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */ 294 + TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ 295 + TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent 
including retransmission */ 296 + TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ 297 + TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */ 298 + TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */ 299 + TCP_NLA_SND_CWND, /* Sending congestion window */ 300 + TCP_NLA_REORDERING, /* Reordering metric */ 301 + TCP_NLA_MIN_RTT, /* minimum RTT */ 302 + TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ 303 + TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ 304 + TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ 305 + TCP_NLA_CA_STATE, /* ca_state of socket */ 306 + TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ 307 + TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ 308 + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ 309 + TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ 310 + TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ 311 + TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ 312 + TCP_NLA_REORD_SEEN, /* reordering events seen */ 313 + TCP_NLA_SRTT, /* smoothed RTT in usecs */ 314 + TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ 315 + TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ 316 + TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ 317 + }; 318 + 319 + /* for TCP_MD5SIG socket option */ 320 + #define TCP_MD5SIG_MAXKEYLEN 80 321 + 322 + /* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ 323 + #define TCP_MD5SIG_FLAG_PREFIX 0x1 /* address prefix length */ 324 + #define TCP_MD5SIG_FLAG_IFINDEX 0x2 /* ifindex set */ 325 + 326 + struct tcp_md5sig { 327 + struct __kernel_sockaddr_storage tcpm_addr; /* address associated */ 328 + __u8 tcpm_flags; /* extension flags */ 329 + __u8 tcpm_prefixlen; /* address prefix */ 330 + __u16 tcpm_keylen; /* key length */ 331 + int tcpm_ifindex; /* device index for scope */ 332 + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ 333 + }; 334 + 335 
+ /* INET_DIAG_MD5SIG */ 336 + struct tcp_diag_md5sig { 337 + __u8 tcpm_family; 338 + __u8 tcpm_prefixlen; 339 + __u16 tcpm_keylen; 340 + __be32 tcpm_addr[4]; 341 + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; 342 + }; 343 + 344 + /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ 345 + 346 + #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 347 + struct tcp_zerocopy_receive { 348 + __u64 address; /* in: address of mapping */ 349 + __u32 length; /* in/out: number of bytes to map/mapped */ 350 + __u32 recv_skip_hint; /* out: amount of bytes to skip */ 351 + __u32 inq; /* out: amount of bytes in read queue */ 352 + __s32 err; /* out: socket error */ 353 + __u64 copybuf_address; /* in: copybuf address (small reads) */ 354 + __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ 355 + __u32 flags; /* in: flags */ 356 + }; 357 + #endif /* _UAPI_LINUX_TCP_H */
+1
tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
··· 2 2 /* Copyright (c) 2019 Facebook */ 3 3 4 4 #include <linux/err.h> 5 + #include <netinet/tcp.h> 5 6 #include <test_progs.h> 6 7 #include "bpf_dctcp.skel.h" 7 8 #include "bpf_cubic.skel.h"
+1
tools/testing/selftests/bpf/prog_tests/cls_redirect.c
··· 7 7 #include <string.h> 8 8 9 9 #include <linux/pkt_cls.h> 10 + #include <netinet/tcp.h> 10 11 11 12 #include <test_progs.h> 12 13
+1
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 // Copyright (c) 2020 Cloudflare 3 3 #include <error.h> 4 + #include <netinet/tcp.h> 4 5 5 6 #include "test_progs.h" 6 7 #include "test_skmsg_load_helpers.skel.h"
+28
tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
··· 2 2 #include <test_progs.h> 3 3 #include "cgroup_helpers.h" 4 4 5 + #include <linux/tcp.h> 6 + 7 + #ifndef SOL_TCP 8 + #define SOL_TCP IPPROTO_TCP 9 + #endif 10 + 5 11 #define SOL_CUSTOM 0xdeadbeef 6 12 7 13 static int getsetsockopt(void) ··· 17 11 char u8[4]; 18 12 __u32 u32; 19 13 char cc[16]; /* TCP_CA_NAME_MAX */ 14 + struct tcp_zerocopy_receive zc; 20 15 } buf = {}; 21 16 socklen_t optlen; 22 17 char *big_buf = NULL; ··· 158 151 if (strcmp(buf.cc, "cubic") != 0) { 159 152 log_err("Unexpected getsockopt(TCP_CONGESTION) %s != %s", 160 153 buf.cc, "cubic"); 154 + goto err; 155 + } 156 + 157 + /* TCP_ZEROCOPY_RECEIVE triggers */ 158 + memset(&buf, 0, sizeof(buf)); 159 + optlen = sizeof(buf.zc); 160 + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); 161 + if (err) { 162 + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", 163 + err, errno); 164 + goto err; 165 + } 166 + 167 + memset(&buf, 0, sizeof(buf)); 168 + buf.zc.address = 12345; /* rejected by BPF */ 169 + optlen = sizeof(buf.zc); 170 + errno = 0; 171 + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); 172 + if (errno != EPERM) { 173 + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", 174 + err, errno); 161 175 goto err; 162 176 } 163 177
+21 -2
tools/testing/selftests/bpf/progs/sockopt_sk.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <string.h> 3 - #include <netinet/in.h> 4 - #include <netinet/tcp.h> 3 + #include <linux/tcp.h> 5 4 #include <linux/bpf.h> 5 + #include <netinet/in.h> 6 6 #include <bpf/bpf_helpers.h> 7 7 8 8 char _license[] SEC("license") = "GPL"; ··· 10 10 11 11 #ifndef PAGE_SIZE 12 12 #define PAGE_SIZE 4096 13 + #endif 14 + 15 + #ifndef SOL_TCP 16 + #define SOL_TCP IPPROTO_TCP 13 17 #endif 14 18 15 19 #define SOL_CUSTOM 0xdeadbeef ··· 58 54 * let next BPF program in the cgroup chain or kernel 59 55 * handle it. 60 56 */ 57 + return 1; 58 + } 59 + 60 + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { 61 + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. 62 + * It has a custom implementation for performance 63 + * reasons. 64 + */ 65 + 66 + if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) 67 + return 0; /* EPERM, bounds check */ 68 + 69 + if (((struct tcp_zerocopy_receive *)optval)->address != 0) 70 + return 0; /* EPERM, unexpected data */ 71 + 61 72 return 1; 62 73 } 63 74
-1
tools/testing/selftests/bpf/test_progs.h
··· 16 16 #include <linux/if_packet.h> 17 17 #include <linux/ip.h> 18 18 #include <linux/ipv6.h> 19 - #include <netinet/tcp.h> 20 19 #include <linux/filter.h> 21 20 #include <linux/perf_event.h> 22 21 #include <linux/socket.h>