Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Add a bhash2 table hashed by port and address

The current bind hashtable (bhash) is hashed by port only.
In the socket bind path, we have to check for bind conflicts by
traversing the specified port's inet_bind_bucket while holding the
hashbucket's spinlock (see inet_csk_get_port() and
inet_csk_bind_conflict()). In instances where there are tons of
sockets hashed to the same port at different addresses, the bind
conflict check is time-intensive and can cause softirq cpu lockups,
as well as stop new tcp connections, since __inet_inherit_port()
also contests for the spinlock.

This patch adds a second bind table, bhash2, that hashes by
port and sk->sk_rcv_saddr (ipv4) or sk->sk_v6_rcv_saddr (ipv6).
Searching the bhash2 table leads to significantly faster conflict
resolution and less time holding the hashbucket spinlock.

Please note a few things:
* There can be the case where a socket's address changes after it
has been bound. There are two cases where this happens:

1) The case where there is a bind() call on INADDR_ANY (ipv4) or
IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will
assign the socket an address when it handles the connect()

2) In inet_sk_reselect_saddr(), which is called when rebuilding the
sk header and a few pre-conditions are met (eg rerouting fails).

In these two cases, we need to update the bhash2 table by removing the
entry for the old address, and adding a new entry reflecting the
updated address.

* The bhash2 table must have its own lock, even though concurrent
accesses on the same port are protected by the bhash lock. Bhash2 must
have its own lock to protect against cases where sockets on different
ports hash to different bhash hashbuckets but to the same bhash2
hashbucket.

This brings up a few stipulations:
1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock
will always be acquired after the bhash lock and released before the
bhash lock is released.

2) There are no nested bhash2 hashbucket locks. A bhash2 lock is always
acquired+released before another bhash2 lock is acquired+released.

* The bhash table cannot be superseded by the bhash2 table because for
bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket
bound to that port must be checked for a potential conflict. The bhash
table is the only source of port->socket associations.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Joanne Koong and committed by
Jakub Kicinski
28044fc1 0bf73255

+704 -98
+3
include/net/inet_connection_sock.h
··· 25 25 #undef INET_CSK_CLEAR_TIMERS 26 26 27 27 struct inet_bind_bucket; 28 + struct inet_bind2_bucket; 28 29 struct tcp_congestion_ops; 29 30 30 31 /* ··· 58 57 * 59 58 * @icsk_accept_queue: FIFO of established children 60 59 * @icsk_bind_hash: Bind node 60 + * @icsk_bind2_hash: Bind node in the bhash2 table 61 61 * @icsk_timeout: Timeout 62 62 * @icsk_retransmit_timer: Resend (no ack) 63 63 * @icsk_rto: Retransmit timeout ··· 85 83 struct inet_sock icsk_inet; 86 84 struct request_sock_queue icsk_accept_queue; 87 85 struct inet_bind_bucket *icsk_bind_hash; 86 + struct inet_bind2_bucket *icsk_bind2_hash; 88 87 unsigned long icsk_timeout; 89 88 struct timer_list icsk_retransmit_timer; 90 89 struct timer_list icsk_delack_timer;
+78 -2
include/net/inet_hashtables.h
··· 23 23 24 24 #include <net/inet_connection_sock.h> 25 25 #include <net/inet_sock.h> 26 + #include <net/ip.h> 26 27 #include <net/sock.h> 27 28 #include <net/route.h> 28 29 #include <net/tcp_states.h> ··· 91 90 struct hlist_head owners; 92 91 }; 93 92 94 - static inline struct net *ib_net(struct inet_bind_bucket *ib) 93 + struct inet_bind2_bucket { 94 + possible_net_t ib_net; 95 + int l3mdev; 96 + unsigned short port; 97 + union { 98 + #if IS_ENABLED(CONFIG_IPV6) 99 + struct in6_addr v6_rcv_saddr; 100 + #endif 101 + __be32 rcv_saddr; 102 + }; 103 + /* Node in the bhash2 inet_bind_hashbucket chain */ 104 + struct hlist_node node; 105 + /* List of sockets hashed to this bucket */ 106 + struct hlist_head owners; 107 + }; 108 + 109 + static inline struct net *ib_net(const struct inet_bind_bucket *ib) 110 + { 111 + return read_pnet(&ib->ib_net); 112 + } 113 + 114 + static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) 95 115 { 96 116 return read_pnet(&ib->ib_net); 97 117 } ··· 155 133 * TCP hash as well as the others for fast bind/connect. 156 134 */ 157 135 struct kmem_cache *bind_bucket_cachep; 136 + /* This bind table is hashed by local port */ 158 137 struct inet_bind_hashbucket *bhash; 138 + struct kmem_cache *bind2_bucket_cachep; 139 + /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) 140 + * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used 141 + * primarily for expediting bind conflict resolution. 
142 + */ 143 + struct inet_bind_hashbucket *bhash2; 159 144 unsigned int bhash_size; 160 145 161 146 /* The 2nd listener table hashed by local port and address */ ··· 211 182 void inet_bind_bucket_destroy(struct kmem_cache *cachep, 212 183 struct inet_bind_bucket *tb); 213 184 185 + bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, 186 + const struct net *net, unsigned short port, 187 + int l3mdev); 188 + 189 + struct inet_bind2_bucket * 190 + inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, 191 + struct inet_bind_hashbucket *head, 192 + unsigned short port, int l3mdev, 193 + const struct sock *sk); 194 + 195 + void inet_bind2_bucket_destroy(struct kmem_cache *cachep, 196 + struct inet_bind2_bucket *tb); 197 + 198 + struct inet_bind2_bucket * 199 + inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, 200 + const struct net *net, 201 + unsigned short port, int l3mdev, 202 + const struct sock *sk); 203 + 204 + bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, 205 + const struct net *net, unsigned short port, 206 + int l3mdev, const struct sock *sk); 207 + 214 208 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, 215 209 const u32 bhash_size) 216 210 { 217 211 return (lport + net_hash_mix(net)) & (bhash_size - 1); 218 212 } 219 213 214 + static inline struct inet_bind_hashbucket * 215 + inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, 216 + const struct net *net, unsigned short port) 217 + { 218 + u32 hash; 219 + 220 + #if IS_ENABLED(CONFIG_IPV6) 221 + if (sk->sk_family == AF_INET6) 222 + hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); 223 + else 224 + #endif 225 + hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); 226 + return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 227 + } 228 + 229 + struct inet_bind_hashbucket * 230 + inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); 231 + 232 + /* 
This should be called whenever a socket's sk_rcv_saddr (ipv4) or 233 + * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's 234 + * rcv_saddr field should already have been updated when this is called. 235 + */ 236 + int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk); 237 + 220 238 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 221 - const unsigned short snum); 239 + struct inet_bind2_bucket *tb2, unsigned short port); 222 240 223 241 /* Caller must disable local BH processing. */ 224 242 int __inet_inherit_port(const struct sock *sk, struct sock *child);
+14
include/net/sock.h
··· 348 348 * @sk_txtime_report_errors: set report errors mode for SO_TXTIME 349 349 * @sk_txtime_unused: unused txtime flags 350 350 * @ns_tracker: tracker for netns reference 351 + * @sk_bind2_node: bind node in the bhash2 table 351 352 */ 352 353 struct sock { 353 354 /* ··· 538 537 #endif 539 538 struct rcu_head sk_rcu; 540 539 netns_tracker ns_tracker; 540 + struct hlist_node sk_bind2_node; 541 541 }; 542 542 543 543 enum sk_pacing { ··· 872 870 hlist_add_head(&sk->sk_bind_node, list); 873 871 } 874 872 873 + static inline void __sk_del_bind2_node(struct sock *sk) 874 + { 875 + __hlist_del(&sk->sk_bind2_node); 876 + } 877 + 878 + static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) 879 + { 880 + hlist_add_head(&sk->sk_bind2_node, list); 881 + } 882 + 875 883 #define sk_for_each(__sk, list) \ 876 884 hlist_for_each_entry(__sk, list, sk_node) 877 885 #define sk_for_each_rcu(__sk, list) \ ··· 899 887 hlist_for_each_entry_safe(__sk, tmp, list, sk_node) 900 888 #define sk_for_each_bound(__sk, list) \ 901 889 hlist_for_each_entry(__sk, list, sk_bind_node) 890 + #define sk_for_each_bound_bhash2(__sk, list) \ 891 + hlist_for_each_entry(__sk, list, sk_bind2_node) 902 892 903 893 /** 904 894 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
+23 -2
net/dccp/ipv4.c
··· 45 45 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 46 46 { 47 47 const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 48 + struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; 49 + __be32 daddr, nexthop, prev_sk_rcv_saddr; 48 50 struct inet_sock *inet = inet_sk(sk); 49 51 struct dccp_sock *dp = dccp_sk(sk); 50 52 __be16 orig_sport, orig_dport; 51 - __be32 daddr, nexthop; 52 53 struct flowi4 *fl4; 53 54 struct rtable *rt; 54 55 int err; ··· 90 89 if (inet_opt == NULL || !inet_opt->opt.srr) 91 90 daddr = fl4->daddr; 92 91 93 - if (inet->inet_saddr == 0) 92 + if (inet->inet_saddr == 0) { 93 + if (inet_csk(sk)->icsk_bind2_hash) { 94 + prev_addr_hashbucket = 95 + inet_bhashfn_portaddr(&dccp_hashinfo, sk, 96 + sock_net(sk), 97 + inet->inet_num); 98 + prev_sk_rcv_saddr = sk->sk_rcv_saddr; 99 + } 94 100 inet->inet_saddr = fl4->saddr; 101 + } 102 + 95 103 sk_rcv_saddr_set(sk, inet->inet_saddr); 104 + 105 + if (prev_addr_hashbucket) { 106 + err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); 107 + if (err) { 108 + inet->inet_saddr = 0; 109 + sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); 110 + ip_rt_put(rt); 111 + return err; 112 + } 113 + } 114 + 96 115 inet->inet_dport = usin->sin_port; 97 116 sk_daddr_set(sk, daddr); 98 117
+18
net/dccp/ipv6.c
··· 934 934 } 935 935 936 936 if (saddr == NULL) { 937 + struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; 938 + struct in6_addr prev_v6_rcv_saddr; 939 + 940 + if (icsk->icsk_bind2_hash) { 941 + prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo, 942 + sk, sock_net(sk), 943 + inet->inet_num); 944 + prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 945 + } 946 + 937 947 saddr = &fl6.saddr; 938 948 sk->sk_v6_rcv_saddr = *saddr; 949 + 950 + if (prev_addr_hashbucket) { 951 + err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); 952 + if (err) { 953 + sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; 954 + goto failure; 955 + } 956 + } 939 957 } 940 958 941 959 /* set the source address */
+29 -5
net/dccp/proto.c
··· 1120 1120 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 1121 1121 if (!dccp_hashinfo.bind_bucket_cachep) 1122 1122 goto out_free_hashinfo2; 1123 + dccp_hashinfo.bind2_bucket_cachep = 1124 + kmem_cache_create("dccp_bind2_bucket", 1125 + sizeof(struct inet_bind2_bucket), 0, 1126 + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); 1127 + if (!dccp_hashinfo.bind2_bucket_cachep) 1128 + goto out_free_bind_bucket_cachep; 1123 1129 1124 1130 /* 1125 1131 * Size and allocate the main established and bind bucket ··· 1156 1150 1157 1151 if (!dccp_hashinfo.ehash) { 1158 1152 DCCP_CRIT("Failed to allocate DCCP established hash table"); 1159 - goto out_free_bind_bucket_cachep; 1153 + goto out_free_bind2_bucket_cachep; 1160 1154 } 1161 1155 1162 1156 for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) ··· 1182 1176 goto out_free_dccp_locks; 1183 1177 } 1184 1178 1179 + dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) 1180 + __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); 1181 + 1182 + if (!dccp_hashinfo.bhash2) { 1183 + DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); 1184 + goto out_free_dccp_bhash; 1185 + } 1186 + 1185 1187 for (i = 0; i < dccp_hashinfo.bhash_size; i++) { 1186 1188 spin_lock_init(&dccp_hashinfo.bhash[i].lock); 1187 1189 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); 1190 + spin_lock_init(&dccp_hashinfo.bhash2[i].lock); 1191 + INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); 1188 1192 } 1189 1193 1190 1194 rc = dccp_mib_init(); 1191 1195 if (rc) 1192 - goto out_free_dccp_bhash; 1196 + goto out_free_dccp_bhash2; 1193 1197 1194 1198 rc = dccp_ackvec_init(); 1195 1199 if (rc) ··· 1223 1207 dccp_ackvec_exit(); 1224 1208 out_free_dccp_mib: 1225 1209 dccp_mib_exit(); 1210 + out_free_dccp_bhash2: 1211 + free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); 1226 1212 out_free_dccp_bhash: 1227 1213 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); 1228 1214 out_free_dccp_locks: 1229 1215 inet_ehash_locks_free(&dccp_hashinfo); 1230 1216 
out_free_dccp_ehash: 1231 1217 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); 1218 + out_free_bind2_bucket_cachep: 1219 + kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); 1232 1220 out_free_bind_bucket_cachep: 1233 1221 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 1234 1222 out_free_hashinfo2: 1235 1223 inet_hashinfo2_free_mod(&dccp_hashinfo); 1236 1224 out_fail: 1237 1225 dccp_hashinfo.bhash = NULL; 1226 + dccp_hashinfo.bhash2 = NULL; 1238 1227 dccp_hashinfo.ehash = NULL; 1239 1228 dccp_hashinfo.bind_bucket_cachep = NULL; 1229 + dccp_hashinfo.bind2_bucket_cachep = NULL; 1240 1230 return rc; 1241 1231 } 1242 1232 1243 1233 static void __exit dccp_fini(void) 1244 1234 { 1235 + int bhash_order = get_order(dccp_hashinfo.bhash_size * 1236 + sizeof(struct inet_bind_hashbucket)); 1237 + 1245 1238 ccid_cleanup_builtins(); 1246 1239 dccp_mib_exit(); 1247 - free_pages((unsigned long)dccp_hashinfo.bhash, 1248 - get_order(dccp_hashinfo.bhash_size * 1249 - sizeof(struct inet_bind_hashbucket))); 1240 + free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); 1241 + free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); 1250 1242 free_pages((unsigned long)dccp_hashinfo.ehash, 1251 1243 get_order((dccp_hashinfo.ehash_mask + 1) * 1252 1244 sizeof(struct inet_ehash_bucket)));
+21 -5
net/ipv4/af_inet.c
··· 1219 1219 1220 1220 static int inet_sk_reselect_saddr(struct sock *sk) 1221 1221 { 1222 + struct inet_bind_hashbucket *prev_addr_hashbucket; 1222 1223 struct inet_sock *inet = inet_sk(sk); 1223 1224 __be32 old_saddr = inet->inet_saddr; 1224 1225 __be32 daddr = inet->inet_daddr; ··· 1227 1226 struct rtable *rt; 1228 1227 __be32 new_saddr; 1229 1228 struct ip_options_rcu *inet_opt; 1229 + int err; 1230 1230 1231 1231 inet_opt = rcu_dereference_protected(inet->inet_opt, 1232 1232 lockdep_sock_is_held(sk)); ··· 1242 1240 if (IS_ERR(rt)) 1243 1241 return PTR_ERR(rt); 1244 1242 1245 - sk_setup_caps(sk, &rt->dst); 1246 - 1247 1243 new_saddr = fl4->saddr; 1248 1244 1249 - if (new_saddr == old_saddr) 1245 + if (new_saddr == old_saddr) { 1246 + sk_setup_caps(sk, &rt->dst); 1250 1247 return 0; 1248 + } 1249 + 1250 + prev_addr_hashbucket = 1251 + inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk, 1252 + sock_net(sk), inet->inet_num); 1253 + 1254 + inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; 1255 + 1256 + err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); 1257 + if (err) { 1258 + inet->inet_saddr = old_saddr; 1259 + inet->inet_rcv_saddr = old_saddr; 1260 + ip_rt_put(rt); 1261 + return err; 1262 + } 1263 + 1264 + sk_setup_caps(sk, &rt->dst); 1251 1265 1252 1266 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { 1253 1267 pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", 1254 1268 __func__, &old_saddr, &new_saddr); 1255 1269 } 1256 - 1257 - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; 1258 1270 1259 1271 /* 1260 1272 * XXX The only one ugly spot where we need to
+214 -67
net/ipv4/inet_connection_sock.c
··· 130 130 } 131 131 EXPORT_SYMBOL(inet_get_local_port_range); 132 132 133 - static int inet_csk_bind_conflict(const struct sock *sk, 134 - const struct inet_bind_bucket *tb, 135 - bool relax, bool reuseport_ok) 133 + static bool inet_use_bhash2_on_bind(const struct sock *sk) 134 + { 135 + #if IS_ENABLED(CONFIG_IPV6) 136 + if (sk->sk_family == AF_INET6) { 137 + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 138 + 139 + return addr_type != IPV6_ADDR_ANY && 140 + addr_type != IPV6_ADDR_MAPPED; 141 + } 142 + #endif 143 + return sk->sk_rcv_saddr != htonl(INADDR_ANY); 144 + } 145 + 146 + static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, 147 + kuid_t sk_uid, bool relax, 148 + bool reuseport_cb_ok, bool reuseport_ok) 149 + { 150 + int bound_dev_if2; 151 + 152 + if (sk == sk2) 153 + return false; 154 + 155 + bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); 156 + 157 + if (!sk->sk_bound_dev_if || !bound_dev_if2 || 158 + sk->sk_bound_dev_if == bound_dev_if2) { 159 + if (sk->sk_reuse && sk2->sk_reuse && 160 + sk2->sk_state != TCP_LISTEN) { 161 + if (!relax || (!reuseport_ok && sk->sk_reuseport && 162 + sk2->sk_reuseport && reuseport_cb_ok && 163 + (sk2->sk_state == TCP_TIME_WAIT || 164 + uid_eq(sk_uid, sock_i_uid(sk2))))) 165 + return true; 166 + } else if (!reuseport_ok || !sk->sk_reuseport || 167 + !sk2->sk_reuseport || !reuseport_cb_ok || 168 + (sk2->sk_state != TCP_TIME_WAIT && 169 + !uid_eq(sk_uid, sock_i_uid(sk2)))) { 170 + return true; 171 + } 172 + } 173 + return false; 174 + } 175 + 176 + static bool inet_bhash2_conflict(const struct sock *sk, 177 + const struct inet_bind2_bucket *tb2, 178 + kuid_t sk_uid, 179 + bool relax, bool reuseport_cb_ok, 180 + bool reuseport_ok) 136 181 { 137 182 struct sock *sk2; 183 + 184 + sk_for_each_bound_bhash2(sk2, &tb2->owners) { 185 + if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) 186 + continue; 187 + 188 + if (inet_bind_conflict(sk, sk2, sk_uid, relax, 189 + reuseport_cb_ok, reuseport_ok)) 190 + 
return true; 191 + } 192 + return false; 193 + } 194 + 195 + /* This should be called only when the tb and tb2 hashbuckets' locks are held */ 196 + static int inet_csk_bind_conflict(const struct sock *sk, 197 + const struct inet_bind_bucket *tb, 198 + const struct inet_bind2_bucket *tb2, /* may be null */ 199 + bool relax, bool reuseport_ok) 200 + { 138 201 bool reuseport_cb_ok; 139 - bool reuse = sk->sk_reuse; 140 - bool reuseport = !!sk->sk_reuseport; 141 202 struct sock_reuseport *reuseport_cb; 142 203 kuid_t uid = sock_i_uid((struct sock *)sk); 143 204 ··· 211 150 /* 212 151 * Unlike other sk lookup places we do not check 213 152 * for sk_net here, since _all_ the socks listed 214 - * in tb->owners list belong to the same net - the 215 - * one this bucket belongs to. 153 + * in tb->owners and tb2->owners list belong 154 + * to the same net - the one this bucket belongs to. 216 155 */ 217 156 218 - sk_for_each_bound(sk2, &tb->owners) { 219 - int bound_dev_if2; 157 + if (!inet_use_bhash2_on_bind(sk)) { 158 + struct sock *sk2; 220 159 221 - if (sk == sk2) 222 - continue; 223 - bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); 224 - if ((!sk->sk_bound_dev_if || 225 - !bound_dev_if2 || 226 - sk->sk_bound_dev_if == bound_dev_if2)) { 227 - if (reuse && sk2->sk_reuse && 228 - sk2->sk_state != TCP_LISTEN) { 229 - if ((!relax || 230 - (!reuseport_ok && 231 - reuseport && sk2->sk_reuseport && 232 - reuseport_cb_ok && 233 - (sk2->sk_state == TCP_TIME_WAIT || 234 - uid_eq(uid, sock_i_uid(sk2))))) && 235 - inet_rcv_saddr_equal(sk, sk2, true)) 236 - break; 237 - } else if (!reuseport_ok || 238 - !reuseport || !sk2->sk_reuseport || 239 - !reuseport_cb_ok || 240 - (sk2->sk_state != TCP_TIME_WAIT && 241 - !uid_eq(uid, sock_i_uid(sk2)))) { 242 - if (inet_rcv_saddr_equal(sk, sk2, true)) 243 - break; 244 - } 245 - } 160 + sk_for_each_bound(sk2, &tb->owners) 161 + if (inet_bind_conflict(sk, sk2, uid, relax, 162 + reuseport_cb_ok, reuseport_ok) && 163 + inet_rcv_saddr_equal(sk, sk2, 
true)) 164 + return true; 165 + 166 + return false; 246 167 } 247 - return sk2 != NULL; 168 + 169 + /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if 170 + * ipv4) should have been checked already. We need to do these two 171 + * checks separately because their spinlocks have to be acquired/released 172 + * independently of each other, to prevent possible deadlocks 173 + */ 174 + return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, 175 + reuseport_ok); 176 + } 177 + 178 + /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or 179 + * INADDR_ANY (if ipv4) socket. 180 + * 181 + * Caller must hold bhash hashbucket lock with local bh disabled, to protect 182 + * against concurrent binds on the port for addr any 183 + */ 184 + static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, 185 + bool relax, bool reuseport_ok) 186 + { 187 + kuid_t uid = sock_i_uid((struct sock *)sk); 188 + const struct net *net = sock_net(sk); 189 + struct sock_reuseport *reuseport_cb; 190 + struct inet_bind_hashbucket *head2; 191 + struct inet_bind2_bucket *tb2; 192 + bool reuseport_cb_ok; 193 + 194 + rcu_read_lock(); 195 + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); 196 + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ 197 + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); 198 + rcu_read_unlock(); 199 + 200 + head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); 201 + 202 + spin_lock(&head2->lock); 203 + 204 + inet_bind_bucket_for_each(tb2, &head2->chain) 205 + if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) 206 + break; 207 + 208 + if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, 209 + reuseport_ok)) { 210 + spin_unlock(&head2->lock); 211 + return true; 212 + } 213 + 214 + spin_unlock(&head2->lock); 215 + return false; 248 216 } 249 217 250 218 /* 251 219 * Find an open port number for the 
socket. Returns with the 252 - * inet_bind_hashbucket lock held. 220 + * inet_bind_hashbucket locks held if successful. 253 221 */ 254 222 static struct inet_bind_hashbucket * 255 - inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) 223 + inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, 224 + struct inet_bind2_bucket **tb2_ret, 225 + struct inet_bind_hashbucket **head2_ret, int *port_ret) 256 226 { 257 227 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 258 228 int port = 0; 259 - struct inet_bind_hashbucket *head; 229 + struct inet_bind_hashbucket *head, *head2; 260 230 struct net *net = sock_net(sk); 261 231 bool relax = false; 262 232 int i, low, high, attempt_half; 233 + struct inet_bind2_bucket *tb2; 263 234 struct inet_bind_bucket *tb; 264 235 u32 remaining, offset; 265 236 int l3mdev; ··· 332 239 head = &hinfo->bhash[inet_bhashfn(net, port, 333 240 hinfo->bhash_size)]; 334 241 spin_lock_bh(&head->lock); 242 + if (inet_use_bhash2_on_bind(sk)) { 243 + if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) 244 + goto next_port; 245 + } 246 + 247 + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 248 + spin_lock(&head2->lock); 249 + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 335 250 inet_bind_bucket_for_each(tb, &head->chain) 336 - if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && 337 - tb->port == port) { 338 - if (!inet_csk_bind_conflict(sk, tb, relax, false)) 251 + if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 252 + if (!inet_csk_bind_conflict(sk, tb, tb2, 253 + relax, false)) 339 254 goto success; 255 + spin_unlock(&head2->lock); 340 256 goto next_port; 341 257 } 342 258 tb = NULL; ··· 374 272 success: 375 273 *port_ret = port; 376 274 *tb_ret = tb; 275 + *tb2_ret = tb2; 276 + *head2_ret = head2; 377 277 return head; 378 278 } 379 279 ··· 472 368 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; 473 369 struct inet_hashinfo *hinfo 
= sk->sk_prot->h.hashinfo; 474 370 int ret = 1, port = snum; 475 - struct inet_bind_hashbucket *head; 476 371 struct net *net = sock_net(sk); 372 + bool found_port = false, check_bind_conflict = true; 373 + bool bhash_created = false, bhash2_created = false; 374 + struct inet_bind_hashbucket *head, *head2; 375 + struct inet_bind2_bucket *tb2 = NULL; 477 376 struct inet_bind_bucket *tb = NULL; 377 + bool head2_lock_acquired = false; 478 378 int l3mdev; 479 379 480 380 l3mdev = inet_sk_bound_l3mdev(sk); 481 381 482 382 if (!port) { 483 - head = inet_csk_find_open_port(sk, &tb, &port); 383 + head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); 484 384 if (!head) 485 385 return ret; 486 - if (!tb) 487 - goto tb_not_found; 488 - goto success; 489 - } 490 - head = &hinfo->bhash[inet_bhashfn(net, port, 491 - hinfo->bhash_size)]; 492 - spin_lock_bh(&head->lock); 493 - inet_bind_bucket_for_each(tb, &head->chain) 494 - if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && 495 - tb->port == port) 496 - goto tb_found; 497 - tb_not_found: 498 - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 499 - net, head, port, l3mdev); 500 - if (!tb) 501 - goto fail_unlock; 502 - tb_found: 503 - if (!hlist_empty(&tb->owners)) { 504 - if (sk->sk_reuse == SK_FORCE_REUSE) 505 - goto success; 506 386 507 - if ((tb->fastreuse > 0 && reuse) || 508 - sk_reuseport_match(tb, sk)) 387 + head2_lock_acquired = true; 388 + 389 + if (tb && tb2) 509 390 goto success; 510 - if (inet_csk_bind_conflict(sk, tb, true, true)) 391 + found_port = true; 392 + } else { 393 + head = &hinfo->bhash[inet_bhashfn(net, port, 394 + hinfo->bhash_size)]; 395 + spin_lock_bh(&head->lock); 396 + inet_bind_bucket_for_each(tb, &head->chain) 397 + if (inet_bind_bucket_match(tb, net, port, l3mdev)) 398 + break; 399 + } 400 + 401 + if (!tb) { 402 + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, 403 + head, port, l3mdev); 404 + if (!tb) 405 + goto fail_unlock; 406 + bhash_created = true; 407 + } 408 + 
409 + if (!found_port) { 410 + if (!hlist_empty(&tb->owners)) { 411 + if (sk->sk_reuse == SK_FORCE_REUSE || 412 + (tb->fastreuse > 0 && reuse) || 413 + sk_reuseport_match(tb, sk)) 414 + check_bind_conflict = false; 415 + } 416 + 417 + if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { 418 + if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) 419 + goto fail_unlock; 420 + } 421 + 422 + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 423 + spin_lock(&head2->lock); 424 + head2_lock_acquired = true; 425 + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 426 + } 427 + 428 + if (!tb2) { 429 + tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, 430 + net, head2, port, l3mdev, sk); 431 + if (!tb2) 432 + goto fail_unlock; 433 + bhash2_created = true; 434 + } 435 + 436 + if (!found_port && check_bind_conflict) { 437 + if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) 511 438 goto fail_unlock; 512 439 } 440 + 513 441 success: 514 442 inet_csk_update_fastreuse(tb, sk); 515 443 516 444 if (!inet_csk(sk)->icsk_bind_hash) 517 - inet_bind_hash(sk, tb, port); 445 + inet_bind_hash(sk, tb, tb2, port); 518 446 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 447 + WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); 519 448 ret = 0; 520 449 521 450 fail_unlock: 451 + if (ret) { 452 + if (bhash_created) 453 + inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); 454 + if (bhash2_created) 455 + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, 456 + tb2); 457 + } 458 + if (head2_lock_acquired) 459 + spin_unlock(&head2->lock); 522 460 spin_unlock_bh(&head->lock); 523 461 return ret; 524 462 } ··· 1108 962 1109 963 inet_sk_set_state(newsk, TCP_SYN_RECV); 1110 964 newicsk->icsk_bind_hash = NULL; 965 + newicsk->icsk_bind2_hash = NULL; 1111 966 1112 967 inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; 1113 968 inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+256 -14
net/ipv4/inet_hashtables.c
··· 92 92 } 93 93 } 94 94 95 - void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 96 - const unsigned short snum) 95 + bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 96 + unsigned short port, int l3mdev) 97 97 { 98 - inet_sk(sk)->inet_num = snum; 98 + return net_eq(ib_net(tb), net) && tb->port == port && 99 + tb->l3mdev == l3mdev; 100 + } 101 + 102 + static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, 103 + struct net *net, 104 + struct inet_bind_hashbucket *head, 105 + unsigned short port, int l3mdev, 106 + const struct sock *sk) 107 + { 108 + write_pnet(&tb->ib_net, net); 109 + tb->l3mdev = l3mdev; 110 + tb->port = port; 111 + #if IS_ENABLED(CONFIG_IPV6) 112 + if (sk->sk_family == AF_INET6) 113 + tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 114 + else 115 + #endif 116 + tb->rcv_saddr = sk->sk_rcv_saddr; 117 + INIT_HLIST_HEAD(&tb->owners); 118 + hlist_add_head(&tb->node, &head->chain); 119 + } 120 + 121 + struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 122 + struct net *net, 123 + struct inet_bind_hashbucket *head, 124 + unsigned short port, 125 + int l3mdev, 126 + const struct sock *sk) 127 + { 128 + struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 129 + 130 + if (tb) 131 + inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); 132 + 133 + return tb; 134 + } 135 + 136 + /* Caller must hold hashbucket lock for this tb with local BH disabled */ 137 + void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 138 + { 139 + if (hlist_empty(&tb->owners)) { 140 + __hlist_del(&tb->node); 141 + kmem_cache_free(cachep, tb); 142 + } 143 + } 144 + 145 + static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 146 + const struct sock *sk) 147 + { 148 + #if IS_ENABLED(CONFIG_IPV6) 149 + if (sk->sk_family == AF_INET6) 150 + return ipv6_addr_equal(&tb2->v6_rcv_saddr, 151 + &sk->sk_v6_rcv_saddr); 152 + #endif 153 + 
return tb2->rcv_saddr == sk->sk_rcv_saddr; 154 + } 155 + 156 + void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 157 + struct inet_bind2_bucket *tb2, unsigned short port) 158 + { 159 + inet_sk(sk)->inet_num = port; 99 160 sk_add_bind_node(sk, &tb->owners); 100 161 inet_csk(sk)->icsk_bind_hash = tb; 162 + sk_add_bind2_node(sk, &tb2->owners); 163 + inet_csk(sk)->icsk_bind2_hash = tb2; 101 164 } 102 165 103 166 /* ··· 172 109 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, 173 110 hashinfo->bhash_size); 174 111 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 112 + struct inet_bind_hashbucket *head2 = 113 + inet_bhashfn_portaddr(hashinfo, sk, sock_net(sk), 114 + inet_sk(sk)->inet_num); 175 115 struct inet_bind_bucket *tb; 176 116 177 117 spin_lock(&head->lock); ··· 183 117 inet_csk(sk)->icsk_bind_hash = NULL; 184 118 inet_sk(sk)->inet_num = 0; 185 119 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 120 + 121 + spin_lock(&head2->lock); 122 + if (inet_csk(sk)->icsk_bind2_hash) { 123 + struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 124 + 125 + __sk_del_bind2_node(sk); 126 + inet_csk(sk)->icsk_bind2_hash = NULL; 127 + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 128 + } 129 + spin_unlock(&head2->lock); 130 + 186 131 spin_unlock(&head->lock); 187 132 } 188 133 ··· 212 135 const int bhash = inet_bhashfn(sock_net(sk), port, 213 136 table->bhash_size); 214 137 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 138 + struct inet_bind_hashbucket *head2 = 139 + inet_bhashfn_portaddr(table, child, sock_net(sk), port); 140 + bool created_inet_bind_bucket = false; 141 + bool update_fastreuse = false; 142 + struct net *net = sock_net(sk); 143 + struct inet_bind2_bucket *tb2; 215 144 struct inet_bind_bucket *tb; 216 145 int l3mdev; 217 146 218 147 spin_lock(&head->lock); 148 + spin_lock(&head2->lock); 219 149 tb = inet_csk(sk)->icsk_bind_hash; 220 - if (unlikely(!tb)) { 150 + tb2 = 
inet_csk(sk)->icsk_bind2_hash; 151 + if (unlikely(!tb || !tb2)) { 152 + spin_unlock(&head2->lock); 221 153 spin_unlock(&head->lock); 222 154 return -ENOENT; 223 155 } ··· 239 153 * as that of the child socket. We have to look up or 240 154 * create a new bind bucket for the child here. */ 241 155 inet_bind_bucket_for_each(tb, &head->chain) { 242 - if (net_eq(ib_net(tb), sock_net(sk)) && 243 - tb->l3mdev == l3mdev && tb->port == port) 156 + if (inet_bind_bucket_match(tb, net, port, l3mdev)) 244 157 break; 245 158 } 246 159 if (!tb) { 247 160 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 248 - sock_net(sk), head, port, 249 - l3mdev); 161 + net, head, port, l3mdev); 250 162 if (!tb) { 163 + spin_unlock(&head2->lock); 251 164 spin_unlock(&head->lock); 252 165 return -ENOMEM; 253 166 } 167 + created_inet_bind_bucket = true; 254 168 } 255 - inet_csk_update_fastreuse(tb, child); 169 + update_fastreuse = true; 170 + 171 + goto bhash2_find; 172 + } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 173 + l3mdev = inet_sk_bound_l3mdev(sk); 174 + 175 + bhash2_find: 176 + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 177 + if (!tb2) { 178 + tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 179 + net, head2, port, 180 + l3mdev, child); 181 + if (!tb2) 182 + goto error; 183 + } 256 184 } 257 - inet_bind_hash(child, tb, port); 185 + if (update_fastreuse) 186 + inet_csk_update_fastreuse(tb, child); 187 + inet_bind_hash(child, tb, tb2, port); 188 + spin_unlock(&head2->lock); 258 189 spin_unlock(&head->lock); 259 190 260 191 return 0; 192 + 193 + error: 194 + if (created_inet_bind_bucket) 195 + inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); 196 + spin_unlock(&head2->lock); 197 + spin_unlock(&head->lock); 198 + return -ENOMEM; 261 199 } 262 200 EXPORT_SYMBOL_GPL(__inet_inherit_port); 263 201 ··· 785 675 } 786 676 EXPORT_SYMBOL_GPL(inet_unhash); 787 677 678 + static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 679 + 
const struct net *net, unsigned short port, 680 + int l3mdev, const struct sock *sk) 681 + { 682 + #if IS_ENABLED(CONFIG_IPV6) 683 + if (sk->sk_family == AF_INET6) 684 + return net_eq(ib2_net(tb), net) && tb->port == port && 685 + tb->l3mdev == l3mdev && 686 + ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 687 + else 688 + #endif 689 + return net_eq(ib2_net(tb), net) && tb->port == port && 690 + tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; 691 + } 692 + 693 + bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 694 + unsigned short port, int l3mdev, const struct sock *sk) 695 + { 696 + #if IS_ENABLED(CONFIG_IPV6) 697 + struct in6_addr addr_any = {}; 698 + 699 + if (sk->sk_family == AF_INET6) 700 + return net_eq(ib2_net(tb), net) && tb->port == port && 701 + tb->l3mdev == l3mdev && 702 + ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any); 703 + else 704 + #endif 705 + return net_eq(ib2_net(tb), net) && tb->port == port && 706 + tb->l3mdev == l3mdev && tb->rcv_saddr == 0; 707 + } 708 + 709 + /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 710 + struct inet_bind2_bucket * 711 + inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 712 + unsigned short port, int l3mdev, const struct sock *sk) 713 + { 714 + struct inet_bind2_bucket *bhash2 = NULL; 715 + 716 + inet_bind_bucket_for_each(bhash2, &head->chain) 717 + if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 718 + break; 719 + 720 + return bhash2; 721 + } 722 + 723 + struct inet_bind_hashbucket * 724 + inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 725 + { 726 + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 727 + u32 hash; 728 + #if IS_ENABLED(CONFIG_IPV6) 729 + struct in6_addr addr_any = {}; 730 + 731 + if (sk->sk_family == AF_INET6) 732 + hash = ipv6_portaddr_hash(net, &addr_any, port); 733 + else 734 + #endif 735 + hash = 
ipv4_portaddr_hash(net, 0, port); 736 + 737 + return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 738 + } 739 + 740 + int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) 741 + { 742 + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 743 + struct inet_bind2_bucket *tb2, *new_tb2; 744 + int l3mdev = inet_sk_bound_l3mdev(sk); 745 + struct inet_bind_hashbucket *head2; 746 + int port = inet_sk(sk)->inet_num; 747 + struct net *net = sock_net(sk); 748 + 749 + /* Allocate a bind2 bucket ahead of time to avoid permanently putting 750 + * the bhash2 table in an inconsistent state if a new tb2 bucket 751 + * allocation fails. 752 + */ 753 + new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 754 + if (!new_tb2) 755 + return -ENOMEM; 756 + 757 + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 758 + 759 + if (prev_saddr) { 760 + spin_lock_bh(&prev_saddr->lock); 761 + __sk_del_bind2_node(sk); 762 + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, 763 + inet_csk(sk)->icsk_bind2_hash); 764 + spin_unlock_bh(&prev_saddr->lock); 765 + } 766 + 767 + spin_lock_bh(&head2->lock); 768 + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 769 + if (!tb2) { 770 + tb2 = new_tb2; 771 + inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); 772 + } 773 + sk_add_bind2_node(sk, &tb2->owners); 774 + inet_csk(sk)->icsk_bind2_hash = tb2; 775 + spin_unlock_bh(&head2->lock); 776 + 777 + if (tb2 != new_tb2) 778 + kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 779 + 780 + return 0; 781 + } 782 + EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); 783 + 788 784 /* RFC 6056 3.3.4. 
Algorithm 4: Double-Hash Port Selection Algorithm 789 785 * Note that we use 32bit integers (vs RFC 'short integers') 790 786 * because 2^16 is not a multiple of num_ephemeral and this ··· 910 694 struct sock *, __u16, struct inet_timewait_sock **)) 911 695 { 912 696 struct inet_hashinfo *hinfo = death_row->hashinfo; 697 + struct inet_bind_hashbucket *head, *head2; 913 698 struct inet_timewait_sock *tw = NULL; 914 - struct inet_bind_hashbucket *head; 915 699 int port = inet_sk(sk)->inet_num; 916 700 struct net *net = sock_net(sk); 701 + struct inet_bind2_bucket *tb2; 917 702 struct inet_bind_bucket *tb; 703 + bool tb_created = false; 918 704 u32 remaining, offset; 919 705 int ret, i, low, high; 920 706 int l3mdev; ··· 973 755 * the established check is already unique enough. 974 756 */ 975 757 inet_bind_bucket_for_each(tb, &head->chain) { 976 - if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && 977 - tb->port == port) { 758 + if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 978 759 if (tb->fastreuse >= 0 || 979 760 tb->fastreuseport >= 0) 980 761 goto next_port; ··· 991 774 spin_unlock_bh(&head->lock); 992 775 return -ENOMEM; 993 776 } 777 + tb_created = true; 994 778 tb->fastreuse = -1; 995 779 tb->fastreuseport = -1; 996 780 goto ok; ··· 1007 789 return -EADDRNOTAVAIL; 1008 790 1009 791 ok: 792 + /* Find the corresponding tb2 bucket since we need to 793 + * add the socket to the bhash2 table as well 794 + */ 795 + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 796 + spin_lock(&head2->lock); 797 + 798 + tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 799 + if (!tb2) { 800 + tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 801 + head2, port, l3mdev, sk); 802 + if (!tb2) 803 + goto error; 804 + } 805 + 1010 806 /* Here we want to add a little bit of randomness to the next source 1011 807 * port that will be chosen. 
We use a max() with a random here so that 1012 808 * on low contention the randomness is maximal and on high contention ··· 1030 798 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); 1031 799 1032 800 /* Head lock still held and bh's disabled */ 1033 - inet_bind_hash(sk, tb, port); 801 + inet_bind_hash(sk, tb, tb2, port); 802 + 803 + spin_unlock(&head2->lock); 804 + 1034 805 if (sk_unhashed(sk)) { 1035 806 inet_sk(sk)->inet_sport = htons(port); 1036 807 inet_ehash_nolisten(sk, (struct sock *)tw, NULL); ··· 1045 810 inet_twsk_deschedule_put(tw); 1046 811 local_bh_enable(); 1047 812 return 0; 813 + 814 + error: 815 + spin_unlock(&head2->lock); 816 + if (tb_created) 817 + inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); 818 + spin_unlock_bh(&head->lock); 819 + return -ENOMEM; 1048 820 } 1049 821 1050 822 /*
+10 -1
net/ipv4/tcp.c
··· 4742 4742 SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4743 4743 SLAB_ACCOUNT, 4744 4744 NULL); 4745 + tcp_hashinfo.bind2_bucket_cachep = 4746 + kmem_cache_create("tcp_bind2_bucket", 4747 + sizeof(struct inet_bind2_bucket), 0, 4748 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4749 + SLAB_ACCOUNT, 4750 + NULL); 4745 4751 4746 4752 /* Size and allocate the main established and bind bucket 4747 4753 * hash tables. ··· 4771 4765 panic("TCP: failed to alloc ehash_locks"); 4772 4766 tcp_hashinfo.bhash = 4773 4767 alloc_large_system_hash("TCP bind", 4774 - sizeof(struct inet_bind_hashbucket), 4768 + 2 * sizeof(struct inet_bind_hashbucket), 4775 4769 tcp_hashinfo.ehash_mask + 1, 4776 4770 17, /* one slot per 128 KB of memory */ 4777 4771 0, ··· 4780 4774 0, 4781 4775 64 * 1024); 4782 4776 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; 4777 + tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; 4783 4778 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 4784 4779 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 4785 4780 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 4781 + spin_lock_init(&tcp_hashinfo.bhash2[i].lock); 4782 + INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); 4786 4783 } 4787 4784 4788 4785
+21 -2
net/ipv4/tcp_ipv4.c
··· 199 199 /* This will initiate an outgoing connection. */ 200 200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 201 201 { 202 + struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; 202 203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 204 + __be32 daddr, nexthop, prev_sk_rcv_saddr; 203 205 struct inet_sock *inet = inet_sk(sk); 204 206 struct tcp_sock *tp = tcp_sk(sk); 205 207 __be16 orig_sport, orig_dport; 206 - __be32 daddr, nexthop; 207 208 struct flowi4 *fl4; 208 209 struct rtable *rt; 209 210 int err; ··· 247 246 if (!inet_opt || !inet_opt->opt.srr) 248 247 daddr = fl4->daddr; 249 248 250 - if (!inet->inet_saddr) 249 + if (!inet->inet_saddr) { 250 + if (inet_csk(sk)->icsk_bind2_hash) { 251 + prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo, 252 + sk, sock_net(sk), 253 + inet->inet_num); 254 + prev_sk_rcv_saddr = sk->sk_rcv_saddr; 255 + } 251 256 inet->inet_saddr = fl4->saddr; 257 + } 258 + 252 259 sk_rcv_saddr_set(sk, inet->inet_saddr); 260 + 261 + if (prev_addr_hashbucket) { 262 + err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); 263 + if (err) { 264 + inet->inet_saddr = 0; 265 + sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); 266 + ip_rt_put(rt); 267 + return err; 268 + } 269 + } 253 270 254 271 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 255 272 /* Reset inherited state */
+17
net/ipv6/tcp_ipv6.c
··· 287 287 } 288 288 289 289 if (!saddr) { 290 + struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; 291 + struct in6_addr prev_v6_rcv_saddr; 292 + 293 + if (icsk->icsk_bind2_hash) { 294 + prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo, 295 + sk, sock_net(sk), 296 + inet->inet_num); 297 + prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 298 + } 290 299 saddr = &fl6.saddr; 291 300 sk->sk_v6_rcv_saddr = *saddr; 301 + 302 + if (prev_addr_hashbucket) { 303 + err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); 304 + if (err) { 305 + sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; 306 + goto failure; 307 + } 308 + } 292 309 } 293 310 294 311 /* set the source address */