Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp/dccp: fix possible race __inet_lookup_established()

Michal Kubecek and Firo Yang did a very nice analysis of crashes
happening in __inet_lookup_established().

Since a TCP socket can go from TCP_ESTABLISHED to TCP_LISTEN
(via a close()/socket()/listen() cycle) without an RCU grace period,
I should not have changed listeners linkage in their hash table.

They must use the nulls protocol (Documentation/RCU/rculist_nulls.txt),
so that a lookup can detect that a socket in a hash list has been
moved to another one.

Since we added code in commit d296ba60d8e2 ("soreuseport: Resolve
merge conflict for v4/v6 ordering fix"), we have to add
hlist_nulls_add_tail_rcu() helper.

Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Reported-by: Firo Yang <firo.yang@suse.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Link: https://lore.kernel.org/netdev/20191120083919.GH27852@unicorn.suse.cz/
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Authored by Eric Dumazet; committed by Jakub Kicinski
(commit 8dbd76e7, parent 8f9cc1ee)

+65 -15
+37
include/linux/rculist_nulls.h
··· 101 101 } 102 102 103 103 /** 104 + * hlist_nulls_add_tail_rcu 105 + * @n: the element to add to the hash list. 106 + * @h: the list to add to. 107 + * 108 + * Description: 109 + * Adds the specified element to the specified hlist_nulls, 110 + * while permitting racing traversals. 111 + * 112 + * The caller must take whatever precautions are necessary 113 + * (such as holding appropriate locks) to avoid racing 114 + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() 115 + * or hlist_nulls_del_rcu(), running on this same list. 116 + * However, it is perfectly legal to run concurrently with 117 + * the _rcu list-traversal primitives, such as 118 + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency 119 + * problems on Alpha CPUs. Regardless of the type of CPU, the 120 + * list-traversal primitive must be guarded by rcu_read_lock(). 121 + */ 122 + static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, 123 + struct hlist_nulls_head *h) 124 + { 125 + struct hlist_nulls_node *i, *last = NULL; 126 + 127 + /* Note: write side code, so rcu accessors are not needed. */ 128 + for (i = h->first; !is_a_nulls(i); i = i->next) 129 + last = i; 130 + 131 + if (last) { 132 + n->next = last->next; 133 + n->pprev = &last->next; 134 + rcu_assign_pointer(hlist_next_rcu(last), n); 135 + } else { 136 + hlist_nulls_add_head_rcu(n, h); 137 + } 138 + } 139 + 140 + /** 104 141 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type 105 142 * @tpos: the type * to use as a loop cursor. 106 143 * @pos: the &struct hlist_nulls_node to use as a loop cursor.
+9 -3
include/net/inet_hashtables.h
··· 103 103 struct hlist_head chain; 104 104 }; 105 105 106 - /* 107 - * Sockets can be hashed in established or listening table 106 + /* Sockets can be hashed in established or listening table. 107 + * We must use different 'nulls' end-of-chain value for all hash buckets : 108 + * A socket might transition from ESTABLISH to LISTEN state without 109 + * RCU grace period. A lookup in ehash table needs to handle this case. 108 110 */ 111 + #define LISTENING_NULLS_BASE (1U << 29) 109 112 struct inet_listen_hashbucket { 110 113 spinlock_t lock; 111 114 unsigned int count; 112 - struct hlist_head head; 115 + union { 116 + struct hlist_head head; 117 + struct hlist_nulls_head nulls_head; 118 + }; 113 119 }; 114 120 115 121 /* This is for listening sockets, thus all sockets which possess wildcards. */
+5
include/net/sock.h
··· 722 722 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); 723 723 } 724 724 725 + static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) 726 + { 727 + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); 728 + } 729 + 725 730 static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 726 731 { 727 732 sock_hold(sk);
+2 -1
net/ipv4/inet_diag.c
··· 911 911 912 912 for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 913 913 struct inet_listen_hashbucket *ilb; 914 + struct hlist_nulls_node *node; 914 915 915 916 num = 0; 916 917 ilb = &hashinfo->listening_hash[i]; 917 918 spin_lock(&ilb->lock); 918 - sk_for_each(sk, &ilb->head) { 919 + sk_nulls_for_each(sk, node, &ilb->nulls_head) { 919 920 struct inet_sock *inet = inet_sk(sk); 920 921 921 922 if (!net_eq(sock_net(sk), net))
+8 -8
net/ipv4/inet_hashtables.c
··· 516 516 struct inet_listen_hashbucket *ilb) 517 517 { 518 518 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 519 + const struct hlist_nulls_node *node; 519 520 struct sock *sk2; 520 521 kuid_t uid = sock_i_uid(sk); 521 522 522 - sk_for_each_rcu(sk2, &ilb->head) { 523 + sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 523 524 if (sk2 != sk && 524 525 sk2->sk_family == sk->sk_family && 525 526 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && ··· 556 555 } 557 556 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 558 557 sk->sk_family == AF_INET6) 559 - hlist_add_tail_rcu(&sk->sk_node, &ilb->head); 558 + __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); 560 559 else 561 - hlist_add_head_rcu(&sk->sk_node, &ilb->head); 560 + __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); 562 561 inet_hash2(hashinfo, sk); 563 562 ilb->count++; 564 563 sock_set_flag(sk, SOCK_RCU_FREE); ··· 607 606 reuseport_detach_sock(sk); 608 607 if (ilb) { 609 608 inet_unhash2(hashinfo, sk); 610 - __sk_del_node_init(sk); 611 - ilb->count--; 612 - } else { 613 - __sk_nulls_del_node_init_rcu(sk); 609 + ilb->count--; 614 610 } 611 + __sk_nulls_del_node_init_rcu(sk); 615 612 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 616 613 unlock: 617 614 spin_unlock_bh(lock); ··· 749 750 750 751 for (i = 0; i < INET_LHTABLE_SIZE; i++) { 751 752 spin_lock_init(&h->listening_hash[i].lock); 752 - INIT_HLIST_HEAD(&h->listening_hash[i].head); 753 + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, 754 + i + LISTENING_NULLS_BASE); 753 755 h->listening_hash[i].count = 0; 754 756 } 755 757
+4 -3
net/ipv4/tcp_ipv4.c
··· 2147 2147 struct tcp_iter_state *st = seq->private; 2148 2148 struct net *net = seq_file_net(seq); 2149 2149 struct inet_listen_hashbucket *ilb; 2150 + struct hlist_nulls_node *node; 2150 2151 struct sock *sk = cur; 2151 2152 2152 2153 if (!sk) { 2153 2154 get_head: 2154 2155 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2155 2156 spin_lock(&ilb->lock); 2156 - sk = sk_head(&ilb->head); 2157 + sk = sk_nulls_head(&ilb->nulls_head); 2157 2158 st->offset = 0; 2158 2159 goto get_sk; 2159 2160 } ··· 2162 2161 ++st->num; 2163 2162 ++st->offset; 2164 2163 2165 - sk = sk_next(sk); 2164 + sk = sk_nulls_next(sk); 2166 2165 get_sk: 2167 - sk_for_each_from(sk) { 2166 + sk_nulls_for_each_from(sk, node) { 2168 2167 if (!net_eq(sock_net(sk), net)) 2169 2168 continue; 2170 2169 if (sk->sk_family == afinfo->family)