Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: add real socket cookies

A long standing problem in netlink socket dumps is the use
of kernel socket addresses as cookies.

1) It is a security concern.

2) Sockets can be reused quite quickly, so there is
no guarantee a cookie is used once and identifies
a flow.

3) request socks, established socks, and timewait socks
for a given flow have different cookies.

Part of our effort to bring better TCP statistics requires
switching to a different allocator.

In this patch, I chose to use a per network namespace 64bit generator,
and to use it only in the case a socket needs to be dumped to netlink.
(This might be refined later if needed)

Note that I tried to carry cookies from request socks, to established socks,
then to timewait sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Salo <salo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
33cf7c90 654eff45

+55 -17
+2 -2
include/linux/sock_diag.h
··· 19 19 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); 20 20 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); 21 21 22 - int sock_diag_check_cookie(void *sk, const __u32 *cookie); 23 - void sock_diag_save_cookie(void *sk, __u32 *cookie); 22 + int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); 23 + void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); 24 24 25 25 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); 26 26 int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+2
include/net/inet_sock.h
··· 77 77 #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr 78 78 #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr 79 79 #define ir_iif req.__req_common.skc_bound_dev_if 80 + #define ir_cookie req.__req_common.skc_cookie 81 + #define ireq_net req.__req_common.skc_net 80 82 81 83 kmemcheck_bitfield_begin(flags); 82 84 u16 snd_wscale : 4,
+1
include/net/inet_timewait_sock.h
··· 122 122 #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr 123 123 #define tw_dport __tw_common.skc_dport 124 124 #define tw_num __tw_common.skc_num 125 + #define tw_cookie __tw_common.skc_cookie 125 126 126 127 int tw_timeout; 127 128 volatile unsigned char tw_substate;
+2
include/net/net_namespace.h
··· 56 56 #endif 57 57 spinlock_t rules_mod_lock; 58 58 59 + atomic64_t cookie_gen; 60 + 59 61 struct list_head list; /* list of network namespaces */ 60 62 struct list_head cleanup_list; /* namespaces on death row */ 61 63 struct list_head exit_list; /* Use only net_mutex */
+3
include/net/sock.h
··· 199 199 struct in6_addr skc_v6_rcv_saddr; 200 200 #endif 201 201 202 + atomic64_t skc_cookie; 203 + 202 204 /* 203 205 * fields between dontcopy_begin/dontcopy_end 204 206 * are not copied in sock_copy() ··· 331 329 #define sk_net __sk_common.skc_net 332 330 #define sk_v6_daddr __sk_common.skc_v6_daddr 333 331 #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr 332 + #define sk_cookie __sk_common.skc_cookie 334 333 335 334 socket_lock_t sk_lock; 336 335 struct sk_buff_head sk_receive_queue;
+1
net/core/sock.c
··· 1538 1538 newsk->sk_err = 0; 1539 1539 newsk->sk_priority = 0; 1540 1540 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1541 + atomic64_set(&newsk->sk_cookie, 0); 1541 1542 /* 1542 1543 * Before updating sk_refcnt, we must commit prior changes to memory 1543 1544 * (Documentation/RCU/rculist_nulls.txt for details)
+27 -10
net/core/sock_diag.c
··· 13 13 static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); 14 14 static DEFINE_MUTEX(sock_diag_table_mutex); 15 15 16 - int sock_diag_check_cookie(void *sk, const __u32 *cookie) 16 + static u64 sock_gen_cookie(struct sock *sk) 17 17 { 18 - if ((cookie[0] != INET_DIAG_NOCOOKIE || 19 - cookie[1] != INET_DIAG_NOCOOKIE) && 20 - ((u32)(unsigned long)sk != cookie[0] || 21 - (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) 22 - return -ESTALE; 23 - else 18 + while (1) { 19 + u64 res = atomic64_read(&sk->sk_cookie); 20 + 21 + if (res) 22 + return res; 23 + res = atomic64_inc_return(&sock_net(sk)->cookie_gen); 24 + atomic64_cmpxchg(&sk->sk_cookie, 0, res); 25 + } 26 + } 27 + 28 + int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) 29 + { 30 + u64 res; 31 + 32 + if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE) 24 33 return 0; 34 + 35 + res = sock_gen_cookie(sk); 36 + if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1]) 37 + return -ESTALE; 38 + 39 + return 0; 25 40 } 26 41 EXPORT_SYMBOL_GPL(sock_diag_check_cookie); 27 42 28 - void sock_diag_save_cookie(void *sk, __u32 *cookie) 43 + void sock_diag_save_cookie(struct sock *sk, __u32 *cookie) 29 44 { 30 - cookie[0] = (u32)(unsigned long)sk; 31 - cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); 45 + u64 res = sock_gen_cookie(sk); 46 + 47 + cookie[0] = (u32)res; 48 + cookie[1] = (u32)(res >> 32); 32 49 } 33 50 EXPORT_SYMBOL_GPL(sock_diag_save_cookie); 34 51
+2
net/dccp/ipv4.c
··· 641 641 ireq = inet_rsk(req); 642 642 ireq->ir_loc_addr = ip_hdr(skb)->daddr; 643 643 ireq->ir_rmt_addr = ip_hdr(skb)->saddr; 644 + ireq->ireq_net = sock_net(sk); 645 + atomic64_set(&ireq->ir_cookie, 0); 644 646 645 647 /* 646 648 * Step 3: Process LISTEN state
+2
net/ipv4/inet_connection_sock.c
··· 678 678 newsk->sk_write_space = sk_stream_write_space; 679 679 680 680 newsk->sk_mark = inet_rsk(req)->ir_mark; 681 + atomic64_set(&newsk->sk_cookie, 682 + atomic64_read(&inet_rsk(req)->ir_cookie)); 681 683 682 684 newicsk->icsk_retransmits = 0; 683 685 newicsk->icsk_backoff = 0;
+9 -5
net/ipv4/inet_diag.c
··· 221 221 user_ns, portid, seq, nlmsg_flags, unlh); 222 222 } 223 223 224 - static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, 224 + static int inet_twsk_diag_fill(struct sock *sk, 225 225 struct sk_buff *skb, 226 226 const struct inet_diag_req_v2 *req, 227 227 u32 portid, u32 seq, u16 nlmsg_flags, 228 228 const struct nlmsghdr *unlh) 229 229 { 230 + struct inet_timewait_sock *tw = inet_twsk(sk); 230 231 struct inet_diag_msg *r; 231 232 struct nlmsghdr *nlh; 232 233 s32 tmo; ··· 248 247 r->idiag_retrans = 0; 249 248 250 249 r->id.idiag_if = tw->tw_bound_dev_if; 251 - sock_diag_save_cookie(tw, r->id.idiag_cookie); 250 + sock_diag_save_cookie(sk, r->id.idiag_cookie); 252 251 253 252 r->id.idiag_sport = tw->tw_sport; 254 253 r->id.idiag_dport = tw->tw_dport; ··· 284 283 const struct nlmsghdr *unlh) 285 284 { 286 285 if (sk->sk_state == TCP_TIME_WAIT) 287 - return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, 286 + return inet_twsk_diag_fill(sk, skb, r, portid, seq, 288 287 nlmsg_flags, unlh); 289 288 290 289 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, ··· 676 675 if (!inet_diag_bc_sk(bc, sk)) 677 676 return 0; 678 677 679 - return inet_twsk_diag_fill(inet_twsk(sk), skb, r, 678 + return inet_twsk_diag_fill(sk, skb, r, 680 679 NETLINK_CB(cb->skb).portid, 681 680 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 682 681 } ··· 735 734 r->idiag_retrans = req->num_retrans; 736 735 737 736 r->id.idiag_if = sk->sk_bound_dev_if; 738 - sock_diag_save_cookie(req, r->id.idiag_cookie); 737 + 738 + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != 739 + offsetof(struct sock, sk_cookie)); 740 + sock_diag_save_cookie((struct sock *)ireq, r->id.idiag_cookie); 739 741 740 742 tmo = req->expires - jiffies; 741 743 if (tmo < 0)
+1
net/ipv4/inet_timewait_sock.c
··· 195 195 tw->tw_ipv6only = 0; 196 196 tw->tw_transparent = inet->transparent; 197 197 tw->tw_prot = sk->sk_prot_creator; 198 + atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); 198 199 twsk_net_set(tw, hold_net(sock_net(sk))); 199 200 /* 200 201 * Because we use RCU lookups, we should not set tw_refcnt
+1
net/ipv4/syncookies.c
··· 346 346 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 347 347 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 348 348 treq->listener = NULL; 349 + ireq->ireq_net = sock_net(sk); 349 350 350 351 /* We throwed the options of the initial SYN away, so we hope 351 352 * the ACK carries the same options again (see RFC1122 4.2.3.8)
+2
net/ipv4/tcp_input.c
··· 5965 5965 5966 5966 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 5967 5967 tcp_openreq_init(req, &tmp_opt, skb, sk); 5968 + inet_rsk(req)->ireq_net = sock_net(sk); 5969 + atomic64_set(&inet_rsk(req)->ir_cookie, 0); 5968 5970 5969 5971 af_ops->init_req(req, sk, skb); 5970 5972