Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net_tstamp: add SCM_TS_OPT_ID to provide OPT_ID in control message

The SOF_TIMESTAMPING_OPT_ID socket option flag gives a way to correlate TX
timestamps with packets sent via a socket. Unfortunately, there is no way
to reliably predict the socket timestamp ID value when sendmsg returns an
error. For UDP sockets it's impossible because of the lockless
nature of UDP transmit: several threads may send packets in parallel. In
the case of RAW sockets, the MSG_MORE option makes things complicated. More
details are in the conversation [1].
This patch adds a new control message type to give user-space
software an opportunity to control the mapping between packets and
timestamp ID values by providing an ID with each sendmsg for UDP sockets.
The documentation is also added in this patch.

[1] https://lore.kernel.org/netdev/CALCETrU0jB+kg0mhV6A8mrHfTE1D1pr1SD_B9Eaa9aDPfgHdtA@mail.gmail.com/

Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
Link: https://patch.msgid.link/20241001125716.2832769-2-vadfed@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Vadim Fedorenko and committed by
Jakub Kicinski
4aecca4c 34ea1df8

+75 -12
+14
Documentation/networking/timestamping.rst
··· 194 194 among all possibly concurrently outstanding timestamp requests for 195 195 that socket. 196 196 197 + The process can optionally override the default generated ID, by 198 + passing a specific ID with control message SCM_TS_OPT_ID (not 199 + supported for TCP sockets):: 200 + 201 + struct msghdr *msg; 202 + ... 203 + cmsg = CMSG_FIRSTHDR(msg); 204 + cmsg->cmsg_level = SOL_SOCKET; 205 + cmsg->cmsg_type = SCM_TS_OPT_ID; 206 + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); 207 + *((__u32 *) CMSG_DATA(cmsg)) = opt_id; 208 + err = sendmsg(fd, msg, 0); 209 + 210 + 197 211 SOF_TIMESTAMPING_OPT_ID_TCP: 198 212 Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP 199 213 timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the
+2
arch/alpha/include/uapi/asm/socket.h
··· 146 146 #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 147 147 #define SO_DEVMEM_DONTNEED 80 148 148 149 + #define SCM_TS_OPT_ID 81 150 + 149 151 #if !defined(__KERNEL__) 150 152 151 153 #if __BITS_PER_LONG == 64
+2
arch/mips/include/uapi/asm/socket.h
··· 157 157 #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 158 158 #define SO_DEVMEM_DONTNEED 80 159 159 160 + #define SCM_TS_OPT_ID 81 161 + 160 162 #if !defined(__KERNEL__) 161 163 162 164 #if __BITS_PER_LONG == 64
+2
arch/parisc/include/uapi/asm/socket.h
··· 138 138 #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 139 139 #define SO_DEVMEM_DONTNEED 80 140 140 141 + #define SCM_TS_OPT_ID 0x404C 142 + 141 143 #if !defined(__KERNEL__) 142 144 143 145 #if __BITS_PER_LONG == 64
+2
arch/sparc/include/uapi/asm/socket.h
··· 139 139 #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 140 140 #define SO_DEVMEM_DONTNEED 0x0059 141 141 142 + #define SCM_TS_OPT_ID 0x005a 143 + 142 144 #if !defined(__KERNEL__) 143 145 144 146
+3 -1
include/net/inet_sock.h
··· 174 174 __s16 tos; 175 175 char priority; 176 176 __u16 gso_size; 177 + u32 ts_opt_id; 177 178 u64 transmit_time; 178 179 u32 mark; 179 180 }; ··· 242 241 struct inet_cork_full cork; 243 242 }; 244 243 245 - #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ 244 + #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ 245 + #define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ 246 246 247 247 enum { 248 248 INET_FLAGS_PKTINFO = 0,
+7
include/net/sock.h
··· 954 954 }; 955 955 956 956 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) 957 + /* 958 + * The highest bit of sk_tsflags is reserved for kernel-internal 959 + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that 960 + * SOF_TIMESTAMPING* values do not reach this reserved area 961 + */ 962 + #define SOCKCM_FLAG_TS_OPT_ID BIT(31) 957 963 958 964 static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) 959 965 { ··· 1802 1796 u64 transmit_time; 1803 1797 u32 mark; 1804 1798 u32 tsflags; 1799 + u32 ts_opt_id; 1805 1800 }; 1806 1801 1807 1802 static inline void sockcm_init(struct sockcm_cookie *sockc,
+2
include/uapi/asm-generic/socket.h
··· 141 141 #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 142 142 #define SO_DEVMEM_DONTNEED 80 143 143 144 + #define SCM_TS_OPT_ID 81 145 + 144 146 #if !defined(__KERNEL__) 145 147 146 148 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
+13
net/core/sock.c
··· 2899 2899 { 2900 2900 u32 tsflags; 2901 2901 2902 + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 2903 + 2902 2904 switch (cmsg->cmsg_type) { 2903 2905 case SO_MARK: 2904 2906 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && ··· 2928 2926 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2929 2927 return -EINVAL; 2930 2928 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2929 + break; 2930 + case SCM_TS_OPT_ID: 2931 + if (sk_is_tcp(sk)) 2932 + return -EINVAL; 2933 + tsflags = READ_ONCE(sk->sk_tsflags); 2934 + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 2935 + return -EINVAL; 2936 + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2937 + return -EINVAL; 2938 + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 2939 + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 2931 2940 break; 2932 2941 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2933 2942 case SCM_RIGHTS:
+14 -5
net/ipv4/ip_output.c
··· 973 973 unsigned int maxfraglen, fragheaderlen, maxnonfragsize; 974 974 int csummode = CHECKSUM_NONE; 975 975 struct rtable *rt = dst_rtable(cork->dst); 976 - bool paged, hold_tskey, extra_uref = false; 976 + bool paged, hold_tskey = false, extra_uref = false; 977 977 unsigned int wmem_alloc_delta = 0; 978 978 u32 tskey = 0; 979 979 ··· 1049 1049 1050 1050 cork->length += length; 1051 1051 1052 - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && 1053 - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; 1054 - if (hold_tskey) 1055 - tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1052 + if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1053 + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1054 + if (cork->flags & IPCORK_TS_OPT_ID) { 1055 + tskey = cork->ts_opt_id; 1056 + } else { 1057 + tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1058 + hold_tskey = true; 1059 + } 1060 + } 1056 1061 1057 1062 /* So, what's going on in the loop below? 1058 1063 * ··· 1332 1327 cork->transmit_time = ipc->sockc.transmit_time; 1333 1328 cork->tx_flags = 0; 1334 1329 sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); 1330 + if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1331 + cork->flags |= IPCORK_TS_OPT_ID; 1332 + cork->ts_opt_id = ipc->sockc.ts_opt_id; 1333 + } 1335 1334 1336 1335 return 0; 1337 1336 }
+14 -6
net/ipv6/ip6_output.c
··· 1402 1402 cork->base.tx_flags = 0; 1403 1403 cork->base.mark = ipc6->sockc.mark; 1404 1404 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); 1405 - 1405 + if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1406 + cork->base.flags |= IPCORK_TS_OPT_ID; 1407 + cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; 1408 + } 1406 1409 cork->base.length = 0; 1407 1410 cork->base.transmit_time = ipc6->sockc.transmit_time; 1408 1411 ··· 1436 1433 bool zc = false; 1437 1434 u32 tskey = 0; 1438 1435 struct rt6_info *rt = dst_rt6_info(cork->dst); 1439 - bool paged, hold_tskey, extra_uref = false; 1436 + bool paged, hold_tskey = false, extra_uref = false; 1440 1437 struct ipv6_txoptions *opt = v6_cork->opt; 1441 1438 int csummode = CHECKSUM_NONE; 1442 1439 unsigned int maxnonfragsize, headersize; ··· 1546 1543 flags &= ~MSG_SPLICE_PAGES; 1547 1544 } 1548 1545 1549 - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && 1550 - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; 1551 - if (hold_tskey) 1552 - tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1546 + if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1547 + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1548 + if (cork->flags & IPCORK_TS_OPT_ID) { 1549 + tskey = cork->ts_opt_id; 1550 + } else { 1551 + tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1552 + hold_tskey = true; 1553 + } 1554 + } 1553 1555 1554 1556 /* 1555 1557 * Let's try using as much space as possible.