
Merge branch 'listener_refactor_part_12'

Eric Dumazet says:

====================
inet: tcp listener refactoring, part 12

By adding a pointer back to the listener, we are preparing SYNACK
retransmit handling to no longer be governed by the listener
keepalive timer, as that timer is the most problematic source of
contention on the listener spinlock. Note that TCP Fast Open
already kept such a pointer, so we make it generic.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
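
As a reading aid, here is a minimal user-space sketch of the lifetime rule the series establishes; the names (listener, request, request_alloc, ...) are illustrative stand-ins, not kernel API. Each pending request takes a counted reference on its listener when it is allocated and drops it when it is freed, so per-request work such as SYNACK retransmission can proceed without taking the listener's lock:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for the kernel's listener and request socks. */
struct listener {
	atomic_int refcnt;
};

struct request {
	struct listener *rsk_listener;	/* counted back-pointer */
	atomic_int refcnt;
};

static void listener_hold(struct listener *l)
{
	atomic_fetch_add(&l->refcnt, 1);
}

static void listener_put(struct listener *l)
{
	if (atomic_fetch_sub(&l->refcnt, 1) == 1)
		free(l);	/* last reference gone */
}

/* Like reqsk_alloc() after this series: the request pins its listener. */
static struct request *request_alloc(struct listener *l)
{
	struct request *req = malloc(sizeof(*req));

	if (req) {
		listener_hold(l);
		req->rsk_listener = l;
		atomic_init(&req->refcnt, 1);
	}
	return req;
}

/* Like reqsk_free(): dropping the request releases the listener ref,
 * so the listener cannot go away under a pending request.
 */
static void request_free(struct request *req)
{
	listener_put(req->rsk_listener);
	free(req);
}

int main(void)
{
	struct listener *l = malloc(sizeof(*l));
	struct request *req;

	if (!l)
		return 1;
	atomic_init(&l->refcnt, 1);

	req = request_alloc(l);		/* req now pins l */
	listener_put(l);		/* owner closes the listener... */
	if (req)
		request_free(req);	/* ...l is only freed here */
	puts("listener outlived by its last request");
	return 0;
}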

12 files changed, 76 insertions(+), 72 deletions(-)

include/linux/tcp.h (+1 -1)

@@ -111,7 +111,7 @@
 struct tcp_request_sock {
 	struct inet_request_sock	req;
 	const struct tcp_request_sock_ops *af_specific;
-	struct sock			*listener; /* needed for TFO */
+	bool				tfo_listener;
 	u32				rcv_isn;
 	u32				snt_isn;
 	u32				snt_synack; /* synack sent time */

include/net/inet_connection_sock.h (-5)

@@ -275,11 +275,6 @@
 					      struct sock *child)
 {
 	reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
-	/* before letting lookups find us, make sure all req fields
-	 * are committed to memory.
-	 */
-	smp_wmb();
-	atomic_set(&req->rsk_refcnt, 1);
 }
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,

include/net/inet_sock.h (+2 -20)

@@ -81,7 +81,6 @@
 #define ir_cookie		req.__req_common.skc_cookie
 #define ireq_net		req.__req_common.skc_net
 #define ireq_state		req.__req_common.skc_state
-#define ireq_refcnt		req.__req_common.skc_refcnt
 #define ireq_family		req.__req_common.skc_family
 
 	kmemcheck_bitfield_begin(flags);
@@ -243,25 +244,8 @@
 					   initval);
 }
 
-static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops)
-{
-	struct request_sock *req = reqsk_alloc(ops);
-	struct inet_request_sock *ireq = inet_rsk(req);
-
-	if (req != NULL) {
-		kmemcheck_annotate_bitfield(ireq, flags);
-		ireq->opt = NULL;
-		atomic64_set(&ireq->ir_cookie, 0);
-		ireq->ireq_state = TCP_NEW_SYN_RECV;
-
-		/* Following is temporary. It is coupled with debugging
-		 * helpers in reqsk_put() & reqsk_free()
-		 */
-		atomic_set(&ireq->ireq_refcnt, 0);
-	}
-
-	return req;
-}
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+				      struct sock *sk_listener);
 
 static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 {

include/net/request_sock.h (+19 -2)

@@ -52,6 +52,7 @@
 #define rsk_refcnt		__req_common.skc_refcnt
 
 	struct request_sock		*dl_next;
+	struct sock			*rsk_listener;
 	u16				mss;
 	u8				num_retrans; /* number of retransmits */
 	u8				cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
@@ -68,12 +67,20 @@
 	u32				peer_secid;
 };
 
-static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
+static inline struct request_sock *
+reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 {
 	struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
 
-	if (req != NULL)
+	if (req) {
 		req->rsk_ops = ops;
+		sock_hold(sk_listener);
+		req->rsk_listener = sk_listener;
 
+		/* Following is temporary. It is coupled with debugging
+		 * helpers in reqsk_put() & reqsk_free()
+		 */
+		atomic_set(&req->rsk_refcnt, 0);
+	}
 	return req;
 }
@@ -97,5 +88,7 @@
 	WARN_ON_ONCE(atomic_read(&req->rsk_refcnt) != 0);
 
 	req->rsk_ops->destructor(req);
+	if (req->rsk_listener)
+		sock_put(req->rsk_listener);
 	kmem_cache_free(req->rsk_ops->slab, req);
 }
@@ -296,6 +285,12 @@
 	req->num_timeout = 0;
 	req->sk = NULL;
 	req->dl_next = lopt->syn_table[hash];
+
+	/* before letting lookups find us, make sure all req fields
+	 * are committed to memory and refcnt initialized.
+	 */
+	smp_wmb();
+	atomic_set(&req->rsk_refcnt, 1);
 
 	write_lock(&queue->syn_wait_lock);
 	lopt->syn_table[hash] = req;
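
The last hunk above is the other half of the refcount change: a request must be fully initialized, refcount included, before it becomes findable in the SYN table. A hedged C11 analogue of this publish pattern, assuming a single writer (the kernel runs the insertion under syn_wait_lock) and illustrative names:

#include <stdatomic.h>
#include <stddef.h>

struct req {
	int refcnt;		/* plain stores, ordered by the release below */
	struct req *dl_next;
};

/* Hypothetical stand-in for lopt->syn_table[hash]. */
static _Atomic(struct req *) slot;

/* Writer: fill in every field, then publish.  The kernel spells this
 * smp_wmb() followed by a plain pointer store; C11 folds both into a
 * single store-release.
 */
static void publish(struct req *req)
{
	req->refcnt = 1;
	req->dl_next = atomic_load_explicit(&slot, memory_order_relaxed);
	atomic_store_explicit(&slot, req, memory_order_release);
}

/* Reader: the acquire load pairs with the release store, so once the
 * pointer is visible, refcnt == 1 and a valid dl_next are visible too.
 */
static struct req *lookup(void)
{
	return atomic_load_explicit(&slot, memory_order_acquire);
}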

net/core/request_sock.c (+7 -11)

@@ -153,23 +153,21 @@
  * case might also exist in tcp_v4_hnd_req() that will trigger this locking
  * order.
  *
- * When a TFO req is created, it needs to sock_hold its listener to prevent
- * the latter data structure from going away.
- *
- * This function also sets "treq->listener" to NULL and unreference listener
- * socket. treq->listener is used by the listener so it is protected by the
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
  * fastopenq->lock in this function.
  */
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset)
 {
-	struct sock *lsk = tcp_rsk(req)->listener;
-	struct fastopen_queue *fastopenq =
-	    inet_csk(lsk)->icsk_accept_queue.fastopenq;
+	struct sock *lsk = req->rsk_listener;
+	struct fastopen_queue *fastopenq;
+
+	fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
 
 	tcp_sk(sk)->fastopen_rsk = NULL;
 	spin_lock_bh(&fastopenq->lock);
 	fastopenq->qlen--;
-	tcp_rsk(req)->listener = NULL;
+	tcp_rsk(req)->tfo_listener = false;
 	if (req->sk) /* the child socket hasn't been accepted yet */
 		goto out;
@@ -177,7 +179,6 @@
 	 * special RST handling below.
	 */
 	spin_unlock_bh(&fastopenq->lock);
-	sock_put(lsk);
 	reqsk_put(req);
 	return;
 }
@@ -198,5 +201,4 @@
 	fastopenq->qlen++;
 out:
 	spin_unlock_bh(&fastopenq->lock);
-	sock_put(lsk);
 }

net/dccp/ipv4.c (+1 -2)

@@ -624,7 +624,7 @@
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = inet_reqsk_alloc(&dccp_request_sock_ops);
+	req = inet_reqsk_alloc(&dccp_request_sock_ops, sk);
 	if (req == NULL)
 		goto drop;
 
@@ -641,7 +641,6 @@
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
-	write_pnet(&ireq->ireq_net, sock_net(sk));
 	ireq->ireq_family = AF_INET;
 	ireq->ir_iif = sk->sk_bound_dev_if;
 

net/dccp/ipv6.c (+1 -2)

@@ -386,7 +386,7 @@
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = inet_reqsk_alloc(&dccp6_request_sock_ops);
+	req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk);
 	if (req == NULL)
 		goto drop;
 
@@ -403,7 +403,6 @@
 	ireq = inet_rsk(req);
 	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
 	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
-	write_pnet(&ireq->ireq_net, sock_net(sk));
 	ireq->ireq_family = AF_INET6;
 
 	if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||

net/ipv4/inet_connection_sock.c (+7 -6)

@@ -293,8 +293,8 @@
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
-	struct sock *newsk;
 	struct request_sock *req;
+	struct sock *newsk;
 	int error;
 
 	lock_sock(sk);
@@ -323,9 +323,11 @@
 	newsk = req->sk;
 
 	sk_acceptq_removed(sk);
-	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+	if (sk->sk_protocol == IPPROTO_TCP &&
+	    tcp_rsk(req)->tfo_listener &&
+	    queue->fastopenq) {
 		spin_lock_bh(&queue->fastopenq->lock);
-		if (tcp_rsk(req)->listener) {
+		if (tcp_rsk(req)->tfo_listener) {
 			/* We are still waiting for the final ACK from 3WHS
 			 * so can't free req now. Instead, we set req->sk to
 			 * NULL to signify that the child socket is taken
@@ -819,9 +817,9 @@
 
 	percpu_counter_inc(sk->sk_prot->orphan_count);
 
-	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
 		BUG_ON(tcp_sk(child)->fastopen_rsk != req);
-		BUG_ON(sk != tcp_rsk(req)->listener);
+		BUG_ON(sk != req->rsk_listener);
 
 		/* Paranoid, to prevent race condition if
 		 * an inbound pkt destined for child is
@@ -830,7 +828,6 @@
 		 * tcp_v4_destroy_sock().
 		 */
 		tcp_sk(child)->fastopen_rsk = NULL;
-		sock_put(sk);
 	}
 	inet_csk_destroy_sock(child);
 

net/ipv4/syncookies.c (+8 -8)

@@ -227,10 +227,11 @@
 	struct sock *child;
 
 	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
-	if (child)
+	if (child) {
+		atomic_set(&req->rsk_refcnt, 1);
 		inet_csk_reqsk_queue_add(sk, req, child);
-	else
+	} else {
 		reqsk_free(req);
-
+	}
 	return child;
 }
@@ -326,7 +325,7 @@
 		goto out;
 
 	ret = NULL;
-	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+	req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
 	if (!req)
 		goto out;
 
@@ -346,8 +345,7 @@
 	ireq->tstamp_ok = tcp_opt.saw_tstamp;
 	req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
-	treq->listener = NULL;
-	write_pnet(&ireq->ireq_net, sock_net(sk));
+	treq->tfo_listener = false;
 	ireq->ireq_family = AF_INET;
 
 	ireq->ir_iif = sk->sk_bound_dev_if;
@@ -357,7 +357,7 @@
 	ireq->opt = tcp_v4_save_options(skb);
 
 	if (security_inet_conn_request(sk, skb, req)) {
-		reqsk_put(req);
+		reqsk_free(req);
 		goto out;
 	}
 
@@ -378,7 +378,7 @@
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
 	if (IS_ERR(rt)) {
-		reqsk_put(req);
+		reqsk_free(req);
 		goto out;
 	}
 
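
Note the switch from reqsk_put() to reqsk_free() on the two error paths above: the refcount now starts at 0 and only reaches 1 once the request is published, so an unpublished request must be torn down directly rather than by dropping a reference. A small illustrative sketch of the distinction (not kernel code):

#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>

struct req {
	atomic_int refcnt;	/* stays 0 until the req is published */
};

/* Analogue of reqsk_free(): destroy an object nobody else can see. */
static void req_free(struct req *r)
{
	assert(atomic_load(&r->refcnt) == 0);
	free(r);
}

/* Analogue of reqsk_put(): drop one reference to a published object. */
static void req_put(struct req *r)
{
	if (atomic_fetch_sub(&r->refcnt, 1) == 1)
		free(r);
}

Calling req_put() on a fresh object whose count is still 0 would underflow the counter and never free it, which is why the pre-publication error paths now use the free-style teardown.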

net/ipv4/tcp_fastopen.c (+2 -6)

@@ -155,12 +155,7 @@
 	tp = tcp_sk(child);
 
 	tp->fastopen_rsk = req;
-	/* Do a hold on the listner sk so that if the listener is being
-	 * closed, the child that has been accepted can live on and still
-	 * access listen_lock.
-	 */
-	sock_hold(sk);
-	tcp_rsk(req)->listener = sk;
+	tcp_rsk(req)->tfo_listener = true;
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never
 	 * scaled. So correct it appropriately.
@@ -169,6 +174,7 @@
 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
 				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 
+	atomic_set(&req->rsk_refcnt, 1);
 	/* Add the child socket directly into the accept queue */
 	inet_csk_reqsk_queue_add(sk, req, child);
 

net/ipv4/tcp_input.c (+22 -3)

@@ -5967,6 +5967,26 @@
 	ireq->ir_mark = inet_request_mark(sk, skb);
 }
 
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+				      struct sock *sk_listener)
+{
+	struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+	if (req) {
+		struct inet_request_sock *ireq = inet_rsk(req);
+
+		kmemcheck_annotate_bitfield(ireq, flags);
+		ireq->opt = NULL;
+		atomic64_set(&ireq->ir_cookie, 0);
+		ireq->ireq_state = TCP_NEW_SYN_RECV;
+		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+
+	}
+
+	return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
@@ -6024,7 +6004,7 @@
 		goto drop;
 	}
 
-	req = inet_reqsk_alloc(rsk_ops);
+	req = inet_reqsk_alloc(rsk_ops, sk);
 	if (!req)
 		goto drop;
 
@@ -6040,7 +6020,6 @@
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
-	write_pnet(&inet_rsk(req)->ireq_net, sock_net(sk));
 
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
 	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
@@ -6116,6 +6097,6 @@
 		if (err || want_cookie)
 			goto drop_and_free;
 
-		tcp_rsk(req)->listener = NULL;
+		tcp_rsk(req)->tfo_listener = false;
 		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}

net/ipv6/syncookies.c (+6 -6)

@@ -49,10 +49,11 @@
 	struct sock *child;
 
 	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
-	if (child)
+	if (child) {
+		atomic_set(&req->rsk_refcnt, 1);
 		inet_csk_reqsk_queue_add(sk, req, child);
-	else
+	} else {
 		reqsk_free(req);
-
+	}
 	return child;
 }
@@ -190,14 +189,13 @@
 		goto out;
 
 	ret = NULL;
-	req = inet_reqsk_alloc(&tcp6_request_sock_ops);
+	req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk);
 	if (!req)
 		goto out;
 
 	ireq = inet_rsk(req);
 	treq = tcp_rsk(req);
-	treq->listener = NULL;
-	write_pnet(&ireq->ireq_net, sock_net(sk));
+	treq->tfo_listener = false;
 	ireq->ireq_family = AF_INET6;
 
 	if (security_inet_conn_request(sk, skb, req))