Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp-add-fast-path-in-timer-handlers'

Eric Dumazet says:

====================
tcp: add fast path in timer handlers

As mentioned in Netconf 2024:

TCP retransmit and delack timers are not stopped from
inet_csk_clear_xmit_timer() because we do not define
INET_CSK_CLEAR_TIMERS.

Enabling INET_CSK_CLEAR_TIMERS leads to lower performance,
mainly because del_timer() and mod_timer() happen from
different CPUs quite often.

What we can do instead is to add fast paths to tcp_write_timer()
and tcp_delack_timer() to avoid socket spinlock acquisition.
====================

Link: https://patch.msgid.link/20241002173042.917928-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+49 -24
+5 -4
include/net/inet_connection_sock.h
··· 197 197 struct inet_connection_sock *icsk = inet_csk(sk); 198 198 199 199 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { 200 - icsk->icsk_pending = 0; 200 + smp_store_release(&icsk->icsk_pending, 0); 201 201 #ifdef INET_CSK_CLEAR_TIMERS 202 202 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 203 203 #endif 204 204 } else if (what == ICSK_TIME_DACK) { 205 - icsk->icsk_ack.pending = 0; 205 + smp_store_release(&icsk->icsk_ack.pending, 0); 206 206 icsk->icsk_ack.retry = 0; 207 207 #ifdef INET_CSK_CLEAR_TIMERS 208 208 sk_stop_timer(sk, &icsk->icsk_delack_timer); ··· 229 229 230 230 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || 231 231 what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { 232 - icsk->icsk_pending = what; 232 + smp_store_release(&icsk->icsk_pending, what); 233 233 icsk->icsk_timeout = jiffies + when; 234 234 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); 235 235 } else if (what == ICSK_TIME_DACK) { 236 - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; 236 + smp_store_release(&icsk->icsk_ack.pending, 237 + icsk->icsk_ack.pending | ICSK_ACK_TIMER); 237 238 icsk->icsk_ack.timeout = jiffies + when; 238 239 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); 239 240 } else {
+4 -2
net/ipv4/inet_connection_sock.c
··· 775 775 { 776 776 struct inet_connection_sock *icsk = inet_csk(sk); 777 777 778 - icsk->icsk_pending = icsk->icsk_ack.pending = 0; 778 + smp_store_release(&icsk->icsk_pending, 0); 779 + smp_store_release(&icsk->icsk_ack.pending, 0); 779 780 780 781 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 781 782 sk_stop_timer(sk, &icsk->icsk_delack_timer); ··· 791 790 /* ongoing timer handlers need to acquire socket lock. */ 792 791 sock_not_owned_by_me(sk); 793 792 794 - icsk->icsk_pending = icsk->icsk_ack.pending = 0; 793 + smp_store_release(&icsk->icsk_pending, 0); 794 + smp_store_release(&icsk->icsk_ack.pending, 0); 795 795 796 796 sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); 797 797 sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
+6 -4
net/ipv4/inet_diag.c
··· 247 247 struct nlmsghdr *nlh; 248 248 struct nlattr *attr; 249 249 void *info = NULL; 250 + u8 icsk_pending; 250 251 int protocol; 251 252 252 253 cb_data = cb->data; ··· 308 307 goto out; 309 308 } 310 309 311 - if (icsk->icsk_pending == ICSK_TIME_RETRANS || 312 - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 313 - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 310 + icsk_pending = smp_load_acquire(&icsk->icsk_pending); 311 + if (icsk_pending == ICSK_TIME_RETRANS || 312 + icsk_pending == ICSK_TIME_REO_TIMEOUT || 313 + icsk_pending == ICSK_TIME_LOSS_PROBE) { 314 314 r->idiag_timer = 1; 315 315 r->idiag_retrans = icsk->icsk_retransmits; 316 316 r->idiag_expires = 317 317 jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); 318 - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 318 + } else if (icsk_pending == ICSK_TIME_PROBE0) { 319 319 r->idiag_timer = 4; 320 320 r->idiag_retrans = icsk->icsk_probes_out; 321 321 r->idiag_expires =
+6 -4
net/ipv4/tcp_ipv4.c
··· 2900 2900 __be32 src = inet->inet_rcv_saddr; 2901 2901 __u16 destp = ntohs(inet->inet_dport); 2902 2902 __u16 srcp = ntohs(inet->inet_sport); 2903 + u8 icsk_pending; 2903 2904 int rx_queue; 2904 2905 int state; 2905 2906 2906 - if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2907 - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2908 - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2907 + icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2908 + if (icsk_pending == ICSK_TIME_RETRANS || 2909 + icsk_pending == ICSK_TIME_REO_TIMEOUT || 2910 + icsk_pending == ICSK_TIME_LOSS_PROBE) { 2909 2911 timer_active = 1; 2910 2912 timer_expires = icsk->icsk_timeout; 2911 - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2913 + } else if (icsk_pending == ICSK_TIME_PROBE0) { 2912 2914 timer_active = 4; 2913 2915 timer_expires = icsk->icsk_timeout; 2914 2916 } else if (timer_pending(&sk->sk_timer)) {
+4 -3
net/ipv4/tcp_output.c
··· 2960 2960 WARN_ONCE(tp->packets_out, 2961 2961 "invalid inflight: %u state %u cwnd %u mss %d\n", 2962 2962 tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); 2963 - inet_csk(sk)->icsk_pending = 0; 2963 + smp_store_release(&inet_csk(sk)->icsk_pending, 0); 2964 2964 return; 2965 2965 } 2966 2966 ··· 2993 2993 2994 2994 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); 2995 2995 /* Reset s.t. tcp_rearm_rto will restart timer from now */ 2996 - inet_csk(sk)->icsk_pending = 0; 2996 + smp_store_release(&inet_csk(sk)->icsk_pending, 0); 2997 2997 rearm_timer: 2998 2998 tcp_rearm_rto(sk); 2999 2999 } ··· 4224 4224 if (!time_before(timeout, icsk->icsk_ack.timeout)) 4225 4225 timeout = icsk->icsk_ack.timeout; 4226 4226 } 4227 - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 4227 + smp_store_release(&icsk->icsk_ack.pending, 4228 + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); 4228 4229 icsk->icsk_ack.timeout = timeout; 4229 4230 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); 4230 4231 }
+16 -2
net/ipv4/tcp_timer.c
··· 361 361 from_timer(icsk, t, icsk_delack_timer); 362 362 struct sock *sk = &icsk->icsk_inet.sk; 363 363 364 + /* Avoid taking socket spinlock if there is no ACK to send. 365 + * The compressed_ack check is racy, but a separate hrtimer 366 + * will take care of it eventually. 367 + */ 368 + if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) && 369 + !READ_ONCE(tcp_sk(sk)->compressed_ack)) 370 + goto out; 371 + 364 372 bh_lock_sock(sk); 365 373 if (!sock_owned_by_user(sk)) { 366 374 tcp_delack_timer_handler(sk); ··· 379 371 sock_hold(sk); 380 372 } 381 373 bh_unlock_sock(sk); 374 + out: 382 375 sock_put(sk); 383 376 } 384 377 ··· 710 701 tcp_send_loss_probe(sk); 711 702 break; 712 703 case ICSK_TIME_RETRANS: 713 - icsk->icsk_pending = 0; 704 + smp_store_release(&icsk->icsk_pending, 0); 714 705 tcp_retransmit_timer(sk); 715 706 break; 716 707 case ICSK_TIME_PROBE0: 717 - icsk->icsk_pending = 0; 708 + smp_store_release(&icsk->icsk_pending, 0); 718 709 tcp_probe_timer(sk); 719 710 break; 720 711 } ··· 726 717 from_timer(icsk, t, icsk_retransmit_timer); 727 718 struct sock *sk = &icsk->icsk_inet.sk; 728 719 720 + /* Avoid locking the socket when there is no pending event. */ 721 + if (!smp_load_acquire(&icsk->icsk_pending)) 722 + goto out; 723 + 729 724 bh_lock_sock(sk); 730 725 if (!sock_owned_by_user(sk)) { 731 726 tcp_write_timer_handler(sk); ··· 739 726 sock_hold(sk); 740 727 } 741 728 bh_unlock_sock(sk); 729 + out: 742 730 sock_put(sk); 743 731 } 744 732
+6 -4
net/ipv6/tcp_ipv6.c
··· 2177 2177 const struct tcp_sock *tp = tcp_sk(sp); 2178 2178 const struct inet_connection_sock *icsk = inet_csk(sp); 2179 2179 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2180 + u8 icsk_pending; 2180 2181 int rx_queue; 2181 2182 int state; 2182 2183 ··· 2186 2185 destp = ntohs(inet->inet_dport); 2187 2186 srcp = ntohs(inet->inet_sport); 2188 2187 2189 - if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2190 - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2191 - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2188 + icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2189 + if (icsk_pending == ICSK_TIME_RETRANS || 2190 + icsk_pending == ICSK_TIME_REO_TIMEOUT || 2191 + icsk_pending == ICSK_TIME_LOSS_PROBE) { 2192 2192 timer_active = 1; 2193 2193 timer_expires = icsk->icsk_timeout; 2194 - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2194 + } else if (icsk_pending == ICSK_TIME_PROBE0) { 2195 2195 timer_active = 4; 2196 2196 timer_expires = icsk->icsk_timeout; 2197 2197 } else if (timer_pending(&sp->sk_timer)) {
+2 -1
net/mptcp/protocol.c
··· 3504 3504 timeout += jiffies; 3505 3505 3506 3506 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 3507 - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 3507 + smp_store_release(&icsk->icsk_ack.pending, 3508 + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); 3508 3509 icsk->icsk_ack.timeout = timeout; 3509 3510 sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); 3510 3511 }