Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'rxrpc-fixes-20200520' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs

David Howells says:

====================
rxrpc: Fix retransmission timeout and ACK discard

Here are a couple of fixes and an extra tracepoint for AF_RXRPC:

(1) Calculate the RTO pretty much as TCP does, rather than using the
made-up scheme with its initial 4s timeout (which causes return
probes from the fileserver to fail if a packet goes missing), and add
backoff.

(2) Fix the discarding of out-of-order received ACKs. We mustn't let the
hard-ACK point regress, nor do we want to do unnecessary
retransmission because the soft-ACK list regresses. This is not
trivial, however: due to some loose wording in various old protocol
specs, the ACK field that should be used for this sometimes has the
wrong information in it.

(3) Add a tracepoint to log a discarded ACK.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+335 -159
+5 -13
fs/afs/fs_probe.c
··· 32 32 struct afs_server *server = call->server; 33 33 unsigned int server_index = call->server_index; 34 34 unsigned int index = call->addr_ix; 35 - unsigned int rtt = UINT_MAX; 35 + unsigned int rtt_us; 36 36 bool have_result = false; 37 - u64 _rtt; 38 37 int ret = call->error; 39 38 40 39 _enter("%pU,%u", &server->uuid, index); ··· 92 93 } 93 94 } 94 95 95 - /* Get the RTT and scale it to fit into a 32-bit value that represents 96 - * over a minute of time so that we can access it with one instruction 97 - * on a 32-bit system. 98 - */ 99 - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); 100 - _rtt /= 64; 101 - rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; 102 - if (rtt < server->probe.rtt) { 103 - server->probe.rtt = rtt; 96 + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); 97 + if (rtt_us < server->probe.rtt) { 98 + server->probe.rtt = rtt_us; 104 99 alist->preferred = index; 105 100 have_result = true; 106 101 } ··· 106 113 spin_unlock(&server->probe_lock); 107 114 108 115 _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", 109 - server_index, index, &alist->addrs[index].transport, 110 - (unsigned int)rtt, ret); 116 + server_index, index, &alist->addrs[index].transport, rtt_us, ret); 111 117 112 118 have_result |= afs_fs_probe_done(server); 113 119 if (have_result)
+5 -13
fs/afs/vl_probe.c
··· 31 31 struct afs_addr_list *alist = call->alist; 32 32 struct afs_vlserver *server = call->vlserver; 33 33 unsigned int server_index = call->server_index; 34 + unsigned int rtt_us = 0; 34 35 unsigned int index = call->addr_ix; 35 - unsigned int rtt = UINT_MAX; 36 36 bool have_result = false; 37 - u64 _rtt; 38 37 int ret = call->error; 39 38 40 39 _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); ··· 92 93 } 93 94 } 94 95 95 - /* Get the RTT and scale it to fit into a 32-bit value that represents 96 - * over a minute of time so that we can access it with one instruction 97 - * on a 32-bit system. 98 - */ 99 - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); 100 - _rtt /= 64; 101 - rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; 102 - if (rtt < server->probe.rtt) { 103 - server->probe.rtt = rtt; 96 + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); 97 + if (rtt_us < server->probe.rtt) { 98 + server->probe.rtt = rtt_us; 104 99 alist->preferred = index; 105 100 have_result = true; 106 101 } ··· 106 113 spin_unlock(&server->probe_lock); 107 114 108 115 _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", 109 - server_index, index, &alist->addrs[index].transport, 110 - (unsigned int)rtt, ret); 116 + server_index, index, &alist->addrs[index].transport, rtt_us, ret); 111 117 112 118 have_result |= afs_vl_probe_done(server); 113 119 if (have_result) {
+1 -1
include/net/af_rxrpc.h
··· 59 59 void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); 60 60 void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, 61 61 struct sockaddr_rxrpc *); 62 - u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); 62 + u32 rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *); 63 63 int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, 64 64 rxrpc_user_attach_call_t, unsigned long, gfp_t, 65 65 unsigned int);
+42 -10
include/trace/events/rxrpc.h
··· 1112 1112 TRACE_EVENT(rxrpc_rtt_rx, 1113 1113 TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, 1114 1114 rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, 1115 - s64 rtt, u8 nr, s64 avg), 1115 + u32 rtt, u32 rto), 1116 1116 1117 - TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), 1117 + TP_ARGS(call, why, send_serial, resp_serial, rtt, rto), 1118 1118 1119 1119 TP_STRUCT__entry( 1120 1120 __field(unsigned int, call ) 1121 1121 __field(enum rxrpc_rtt_rx_trace, why ) 1122 - __field(u8, nr ) 1123 1122 __field(rxrpc_serial_t, send_serial ) 1124 1123 __field(rxrpc_serial_t, resp_serial ) 1125 - __field(s64, rtt ) 1126 - __field(u64, avg ) 1124 + __field(u32, rtt ) 1125 + __field(u32, rto ) 1127 1126 ), 1128 1127 1129 1128 TP_fast_assign( ··· 1131 1132 __entry->send_serial = send_serial; 1132 1133 __entry->resp_serial = resp_serial; 1133 1134 __entry->rtt = rtt; 1134 - __entry->nr = nr; 1135 - __entry->avg = avg; 1135 + __entry->rto = rto; 1136 1136 ), 1137 1137 1138 - TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", 1138 + TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%u rto=%u", 1139 1139 __entry->call, 1140 1140 __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), 1141 1141 __entry->send_serial, 1142 1142 __entry->resp_serial, 1143 1143 __entry->rtt, 1144 - __entry->nr, 1145 - __entry->avg) 1144 + __entry->rto) 1146 1145 ); 1147 1146 1148 1147 TRACE_EVENT(rxrpc_timer, ··· 1539 1542 TP_printk("c=%08x r=%08x", 1540 1543 __entry->debug_id, 1541 1544 __entry->serial) 1545 + ); 1546 + 1547 + TRACE_EVENT(rxrpc_rx_discard_ack, 1548 + TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, 1549 + rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, 1550 + rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), 1551 + 1552 + TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, 1553 + prev_pkt, call_ackr_prev), 1554 + 1555 + TP_STRUCT__entry( 1556 + __field(unsigned int, debug_id ) 1557 + __field(rxrpc_serial_t, serial ) 1558 + 
__field(rxrpc_seq_t, first_soft_ack) 1559 + __field(rxrpc_seq_t, call_ackr_first) 1560 + __field(rxrpc_seq_t, prev_pkt) 1561 + __field(rxrpc_seq_t, call_ackr_prev) 1562 + ), 1563 + 1564 + TP_fast_assign( 1565 + __entry->debug_id = debug_id; 1566 + __entry->serial = serial; 1567 + __entry->first_soft_ack = first_soft_ack; 1568 + __entry->call_ackr_first = call_ackr_first; 1569 + __entry->prev_pkt = prev_pkt; 1570 + __entry->call_ackr_prev = call_ackr_prev; 1571 + ), 1572 + 1573 + TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", 1574 + __entry->debug_id, 1575 + __entry->serial, 1576 + __entry->first_soft_ack, 1577 + __entry->call_ackr_first, 1578 + __entry->prev_pkt, 1579 + __entry->call_ackr_prev) 1542 1580 ); 1543 1581 1544 1582 #endif /* _TRACE_RXRPC_H */
+1
net/rxrpc/Makefile
··· 25 25 peer_event.o \ 26 26 peer_object.o \ 27 27 recvmsg.o \ 28 + rtt.o \ 28 29 security.o \ 29 30 sendmsg.o \ 30 31 skbuff.o \
+17 -8
net/rxrpc/ar-internal.h
··· 7 7 8 8 #include <linux/atomic.h> 9 9 #include <linux/seqlock.h> 10 + #include <linux/win_minmax.h> 10 11 #include <net/net_namespace.h> 11 12 #include <net/netns/generic.h> 12 13 #include <net/sock.h> ··· 312 311 #define RXRPC_RTT_CACHE_SIZE 32 313 312 spinlock_t rtt_input_lock; /* RTT lock for input routine */ 314 313 ktime_t rtt_last_req; /* Time of last RTT request */ 315 - u64 rtt; /* Current RTT estimate (in nS) */ 316 - u64 rtt_sum; /* Sum of cache contents */ 317 - u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ 318 - u8 rtt_cursor; /* next entry at which to insert */ 319 - u8 rtt_usage; /* amount of cache actually used */ 314 + unsigned int rtt_count; /* Number of samples we've got */ 315 + 316 + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ 317 + u32 mdev_us; /* medium deviation */ 318 + u32 mdev_max_us; /* maximal mdev for the last rtt period */ 319 + u32 rttvar_us; /* smoothed mdev_max */ 320 + u32 rto_j; /* Retransmission timeout in jiffies */ 321 + u8 backoff; /* Backoff timeout */ 320 322 321 323 u8 cong_cwnd; /* Congestion window size */ 322 324 }; ··· 1045 1041 extern unsigned int rxrpc_rx_window_size; 1046 1042 extern unsigned int rxrpc_rx_mtu; 1047 1043 extern unsigned int rxrpc_rx_jumbo_max; 1048 - extern unsigned long rxrpc_resend_timeout; 1049 1044 1050 1045 extern const s8 rxrpc_ack_priority[]; 1051 1046 ··· 1072 1069 * peer_event.c 1073 1070 */ 1074 1071 void rxrpc_error_report(struct sock *); 1075 - void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, 1076 - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); 1077 1072 void rxrpc_peer_keepalive_worker(struct work_struct *); 1078 1073 1079 1074 /* ··· 1102 1101 */ 1103 1102 void rxrpc_notify_socket(struct rxrpc_call *); 1104 1103 int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); 1104 + 1105 + /* 1106 + * rtt.c 1107 + */ 1108 + void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, 1109 + rxrpc_serial_t, 
rxrpc_serial_t, ktime_t, ktime_t); 1110 + unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); 1111 + void rxrpc_peer_init_rtt(struct rxrpc_peer *); 1105 1112 1106 1113 /* 1107 1114 * rxkad.c
+1 -1
net/rxrpc/call_accept.c
··· 248 248 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 249 249 ktime_t now = skb->tstamp; 250 250 251 - if (call->peer->rtt_usage < 3 || 251 + if (call->peer->rtt_count < 3 || 252 252 ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) 253 253 rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, 254 254 true, true,
+8 -14
net/rxrpc/call_event.c
··· 111 111 } else { 112 112 unsigned long now = jiffies, ack_at; 113 113 114 - if (call->peer->rtt_usage > 0) 115 - ack_at = nsecs_to_jiffies(call->peer->rtt); 114 + if (call->peer->srtt_us != 0) 115 + ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); 116 116 else 117 117 ack_at = expiry; 118 118 ··· 157 157 static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) 158 158 { 159 159 struct sk_buff *skb; 160 - unsigned long resend_at; 160 + unsigned long resend_at, rto_j; 161 161 rxrpc_seq_t cursor, seq, top; 162 - ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo; 162 + ktime_t now, max_age, oldest, ack_ts; 163 163 int ix; 164 164 u8 annotation, anno_type, retrans = 0, unacked = 0; 165 165 166 166 _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); 167 167 168 - if (call->peer->rtt_usage > 1) 169 - timeout = ns_to_ktime(call->peer->rtt * 3 / 2); 170 - else 171 - timeout = ms_to_ktime(rxrpc_resend_timeout); 172 - min_timeo = ns_to_ktime((1000000000 / HZ) * 4); 173 - if (ktime_before(timeout, min_timeo)) 174 - timeout = min_timeo; 168 + rto_j = call->peer->rto_j; 175 169 176 170 now = ktime_get_real(); 177 - max_age = ktime_sub(now, timeout); 171 + max_age = ktime_sub(now, jiffies_to_usecs(rto_j)); 178 172 179 173 spin_lock_bh(&call->lock); 180 174 ··· 213 219 } 214 220 215 221 resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); 216 - resend_at += jiffies + rxrpc_resend_timeout; 222 + resend_at += jiffies + rto_j; 217 223 WRITE_ONCE(call->resend_at, resend_at); 218 224 219 225 if (unacked) ··· 228 234 rxrpc_timer_set_for_resend); 229 235 spin_unlock_bh(&call->lock); 230 236 ack_ts = ktime_sub(now, call->acks_latest_ts); 231 - if (ktime_to_ns(ack_ts) < call->peer->rtt) 237 + if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) 232 238 goto out; 233 239 rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, 234 240 rxrpc_propose_ack_ping_for_lost_ack);
+37 -7
net/rxrpc/input.c
··· 91 91 /* We analyse the number of packets that get ACK'd per RTT 92 92 * period and increase the window if we managed to fill it. 93 93 */ 94 - if (call->peer->rtt_usage == 0) 94 + if (call->peer->rtt_count == 0) 95 95 goto out; 96 96 if (ktime_before(skb->tstamp, 97 - ktime_add_ns(call->cong_tstamp, 98 - call->peer->rtt))) 97 + ktime_add_us(call->cong_tstamp, 98 + call->peer->srtt_us >> 3))) 99 99 goto out_no_clear_ca; 100 100 change = rxrpc_cong_rtt_window_end; 101 101 call->cong_tstamp = skb->tstamp; ··· 803 803 } 804 804 805 805 /* 806 + * Return true if the ACK is valid - ie. it doesn't appear to have regressed 807 + * with respect to the ack state conveyed by preceding ACKs. 808 + */ 809 + static bool rxrpc_is_ack_valid(struct rxrpc_call *call, 810 + rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) 811 + { 812 + rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq); 813 + 814 + if (after(first_pkt, base)) 815 + return true; /* The window advanced */ 816 + 817 + if (before(first_pkt, base)) 818 + return false; /* firstPacket regressed */ 819 + 820 + if (after_eq(prev_pkt, call->ackr_prev_seq)) 821 + return true; /* previousPacket hasn't regressed. */ 822 + 823 + /* Some rx implementations put a serial number in previousPacket. */ 824 + if (after_eq(prev_pkt, base + call->tx_winsize)) 825 + return false; 826 + return true; 827 + } 828 + 829 + /* 806 830 * Process an ACK packet. 807 831 * 808 832 * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet ··· 889 865 } 890 866 891 867 /* Discard any out-of-order or duplicate ACKs (outside lock). 
*/ 892 - if (before(first_soft_ack, call->ackr_first_seq) || 893 - before(prev_pkt, call->ackr_prev_seq)) 868 + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { 869 + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, 870 + first_soft_ack, call->ackr_first_seq, 871 + prev_pkt, call->ackr_prev_seq); 894 872 return; 873 + } 895 874 896 875 buf.info.rxMTU = 0; 897 876 ioffset = offset + nr_acks + 3; ··· 905 878 spin_lock(&call->input_lock); 906 879 907 880 /* Discard any out-of-order or duplicate ACKs (inside lock). */ 908 - if (before(first_soft_ack, call->ackr_first_seq) || 909 - before(prev_pkt, call->ackr_prev_seq)) 881 + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { 882 + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, 883 + first_soft_ack, call->ackr_first_seq, 884 + prev_pkt, call->ackr_prev_seq); 910 885 goto out; 886 + } 911 887 call->acks_latest_ts = skb->tstamp; 912 888 913 889 call->ackr_first_seq = first_soft_ack;
-5
net/rxrpc/misc.c
··· 63 63 */ 64 64 unsigned int rxrpc_rx_jumbo_max = 4; 65 65 66 - /* 67 - * Time till packet resend (in milliseconds). 68 - */ 69 - unsigned long rxrpc_resend_timeout = 4 * HZ; 70 - 71 66 const s8 rxrpc_ack_priority[] = { 72 67 [0] = 0, 73 68 [RXRPC_ACK_DELAY] = 1,
+3 -6
net/rxrpc/output.c
··· 369 369 (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || 370 370 retrans || 371 371 call->cong_mode == RXRPC_CALL_SLOW_START || 372 - (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || 372 + (call->peer->rtt_count < 3 && sp->hdr.seq & 1) || 373 373 ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), 374 374 ktime_get_real()))) 375 375 whdr.flags |= RXRPC_REQUEST_ACK; ··· 423 423 if (whdr.flags & RXRPC_REQUEST_ACK) { 424 424 call->peer->rtt_last_req = skb->tstamp; 425 425 trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); 426 - if (call->peer->rtt_usage > 1) { 426 + if (call->peer->rtt_count > 1) { 427 427 unsigned long nowj = jiffies, ack_lost_at; 428 428 429 - ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); 430 - if (ack_lost_at < 1) 431 - ack_lost_at = 1; 432 - 429 + ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans); 433 430 ack_lost_at += nowj; 434 431 WRITE_ONCE(call->ack_lost_at, ack_lost_at); 435 432 rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
-46
net/rxrpc/peer_event.c
··· 296 296 } 297 297 298 298 /* 299 - * Add RTT information to cache. This is called in softirq mode and has 300 - * exclusive access to the peer RTT data. 301 - */ 302 - void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, 303 - rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, 304 - ktime_t send_time, ktime_t resp_time) 305 - { 306 - struct rxrpc_peer *peer = call->peer; 307 - s64 rtt; 308 - u64 sum = peer->rtt_sum, avg; 309 - u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; 310 - 311 - rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); 312 - if (rtt < 0) 313 - return; 314 - 315 - spin_lock(&peer->rtt_input_lock); 316 - 317 - /* Replace the oldest datum in the RTT buffer */ 318 - sum -= peer->rtt_cache[cursor]; 319 - sum += rtt; 320 - peer->rtt_cache[cursor] = rtt; 321 - peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); 322 - peer->rtt_sum = sum; 323 - if (usage < RXRPC_RTT_CACHE_SIZE) { 324 - usage++; 325 - peer->rtt_usage = usage; 326 - } 327 - 328 - spin_unlock(&peer->rtt_input_lock); 329 - 330 - /* Now recalculate the average */ 331 - if (usage == RXRPC_RTT_CACHE_SIZE) { 332 - avg = sum / RXRPC_RTT_CACHE_SIZE; 333 - } else { 334 - avg = sum; 335 - do_div(avg, usage); 336 - } 337 - 338 - /* Don't need to update this under lock */ 339 - peer->rtt = avg; 340 - trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, 341 - usage, avg); 342 - } 343 - 344 - /* 345 299 * Perform keep-alive pings. 346 300 */ 347 301 static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
+7 -5
net/rxrpc/peer_object.c
··· 225 225 spin_lock_init(&peer->rtt_input_lock); 226 226 peer->debug_id = atomic_inc_return(&rxrpc_debug_id); 227 227 228 + rxrpc_peer_init_rtt(peer); 229 + 228 230 if (RXRPC_TX_SMSS > 2190) 229 231 peer->cong_cwnd = 2; 230 232 else if (RXRPC_TX_SMSS > 1095) ··· 499 497 EXPORT_SYMBOL(rxrpc_kernel_get_peer); 500 498 501 499 /** 502 - * rxrpc_kernel_get_rtt - Get a call's peer RTT 500 + * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT 503 501 * @sock: The socket on which the call is in progress. 504 502 * @call: The call to query 505 503 * 506 - * Get the call's peer RTT. 504 + * Get the call's peer smoothed RTT. 507 505 */ 508 - u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) 506 + u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call) 509 507 { 510 - return call->peer->rtt; 508 + return call->peer->srtt_us >> 3; 511 509 } 512 - EXPORT_SYMBOL(rxrpc_kernel_get_rtt); 510 + EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
+4 -4
net/rxrpc/proc.c
··· 222 222 seq_puts(seq, 223 223 "Proto Local " 224 224 " Remote " 225 - " Use CW MTU LastUse RTT Rc\n" 225 + " Use CW MTU LastUse RTT RTO\n" 226 226 ); 227 227 return 0; 228 228 } ··· 236 236 now = ktime_get_seconds(); 237 237 seq_printf(seq, 238 238 "UDP %-47.47s %-47.47s %3u" 239 - " %3u %5u %6llus %12llu %2u\n", 239 + " %3u %5u %6llus %8u %8u\n", 240 240 lbuff, 241 241 rbuff, 242 242 atomic_read(&peer->usage), 243 243 peer->cong_cwnd, 244 244 peer->mtu, 245 245 now - peer->last_tx_at, 246 - peer->rtt, 247 - peer->rtt_cursor); 246 + peer->srtt_us >> 3, 247 + jiffies_to_usecs(peer->rto_j)); 248 248 249 249 return 0; 250 250 }
+195
net/rxrpc/rtt.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* RTT/RTO calculation. 3 + * 4 + * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) 5 + * 6 + * https://tools.ietf.org/html/rfc6298 7 + * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 8 + * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf 9 + */ 10 + 11 + #include <linux/net.h> 12 + #include "ar-internal.h" 13 + 14 + #define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) 15 + #define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ 16 + #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ 17 + #define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */ 18 + 19 + static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) 20 + { 21 + return 200; 22 + } 23 + 24 + static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) 25 + { 26 + return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); 27 + } 28 + 29 + static u32 rxrpc_bound_rto(u32 rto) 30 + { 31 + return min(rto, RXRPC_RTO_MAX); 32 + } 33 + 34 + /* 35 + * Called to compute a smoothed rtt estimate. The data fed to this 36 + * routine either comes from timestamps, or from segments that were 37 + * known _not_ to have been retransmitted [see Karn/Partridge 38 + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 39 + * piece by Van Jacobson. 40 + * NOTE: the next three routines used to be one big routine. 41 + * To save cycles in the RFC 1323 implementation it was better to break 42 + * it up into three procedures. -- erics 43 + */ 44 + static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) 45 + { 46 + long m = sample_rtt_us; /* RTT */ 47 + u32 srtt = peer->srtt_us; 48 + 49 + /* The following amusing code comes from Jacobson's 50 + * article in SIGCOMM '88. Note that rtt and mdev 51 + * are scaled versions of rtt and mean deviation. 52 + * This is designed to be as fast as possible 53 + * m stands for "measurement". 
54 + * 55 + * On a 1990 paper the rto value is changed to: 56 + * RTO = rtt + 4 * mdev 57 + * 58 + * Funny. This algorithm seems to be very broken. 59 + * These formulae increase RTO, when it should be decreased, increase 60 + * too slowly, when it should be increased quickly, decrease too quickly 61 + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely 62 + * does not matter how to _calculate_ it. Seems, it was trap 63 + * that VJ failed to avoid. 8) 64 + */ 65 + if (srtt != 0) { 66 + m -= (srtt >> 3); /* m is now error in rtt est */ 67 + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 68 + if (m < 0) { 69 + m = -m; /* m is now abs(error) */ 70 + m -= (peer->mdev_us >> 2); /* similar update on mdev */ 71 + /* This is similar to one of Eifel findings. 72 + * Eifel blocks mdev updates when rtt decreases. 73 + * This solution is a bit different: we use finer gain 74 + * for mdev in this case (alpha*beta). 75 + * Like Eifel it also prevents growth of rto, 76 + * but also it limits too fast rto decreases, 77 + * happening in pure Eifel. 78 + */ 79 + if (m > 0) 80 + m >>= 3; 81 + } else { 82 + m -= (peer->mdev_us >> 2); /* similar update on mdev */ 83 + } 84 + 85 + peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ 86 + if (peer->mdev_us > peer->mdev_max_us) { 87 + peer->mdev_max_us = peer->mdev_us; 88 + if (peer->mdev_max_us > peer->rttvar_us) 89 + peer->rttvar_us = peer->mdev_max_us; 90 + } 91 + } else { 92 + /* no previous measure. */ 93 + srtt = m << 3; /* take the measured time to be rtt */ 94 + peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ 95 + peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); 96 + peer->mdev_max_us = peer->rttvar_us; 97 + } 98 + 99 + peer->srtt_us = max(1U, srtt); 100 + } 101 + 102 + /* 103 + * Calculate rto without backoff. This is the second half of Van Jacobson's 104 + * routine referred to above. 105 + */ 106 + static void rxrpc_set_rto(struct rxrpc_peer *peer) 107 + { 108 + u32 rto; 109 + 110 + /* 1. 
If rtt variance happened to be less 50msec, it is hallucination. 111 + * It cannot be less due to utterly erratic ACK generation made 112 + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ 113 + * to do with delayed acks, because at cwnd>2 true delack timeout 114 + * is invisible. Actually, Linux-2.4 also generates erratic 115 + * ACKs in some circumstances. 116 + */ 117 + rto = __rxrpc_set_rto(peer); 118 + 119 + /* 2. Fixups made earlier cannot be right. 120 + * If we do not estimate RTO correctly without them, 121 + * all the algo is pure shit and should be replaced 122 + * with correct one. It is exactly, which we pretend to do. 123 + */ 124 + 125 + /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo 126 + * guarantees that rto is higher. 127 + */ 128 + peer->rto_j = rxrpc_bound_rto(rto); 129 + } 130 + 131 + static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) 132 + { 133 + if (rtt_us < 0) 134 + return; 135 + 136 + //rxrpc_update_rtt_min(peer, rtt_us); 137 + rxrpc_rtt_estimator(peer, rtt_us); 138 + rxrpc_set_rto(peer); 139 + 140 + /* RFC6298: only reset backoff on valid RTT measurement. */ 141 + peer->backoff = 0; 142 + } 143 + 144 + /* 145 + * Add RTT information to cache. This is called in softirq mode and has 146 + * exclusive access to the peer RTT data. 
147 + */ 148 + void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, 149 + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, 150 + ktime_t send_time, ktime_t resp_time) 151 + { 152 + struct rxrpc_peer *peer = call->peer; 153 + s64 rtt_us; 154 + 155 + rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); 156 + if (rtt_us < 0) 157 + return; 158 + 159 + spin_lock(&peer->rtt_input_lock); 160 + rxrpc_ack_update_rtt(peer, rtt_us); 161 + if (peer->rtt_count < 3) 162 + peer->rtt_count++; 163 + spin_unlock(&peer->rtt_input_lock); 164 + 165 + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, 166 + peer->srtt_us >> 3, peer->rto_j); 167 + } 168 + 169 + /* 170 + * Get the retransmission timeout to set in jiffies, backing it off each time 171 + * we retransmit. 172 + */ 173 + unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) 174 + { 175 + u64 timo_j; 176 + u8 backoff = READ_ONCE(peer->backoff); 177 + 178 + timo_j = peer->rto_j; 179 + timo_j <<= backoff; 180 + if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) 181 + WRITE_ONCE(peer->backoff, backoff + 1); 182 + 183 + if (timo_j < 1) 184 + timo_j = 1; 185 + 186 + return timo_j; 187 + } 188 + 189 + void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) 190 + { 191 + peer->rto_j = RXRPC_TIMEOUT_INIT; 192 + peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); 193 + peer->backoff = 0; 194 + //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); 195 + }
+9 -17
net/rxrpc/sendmsg.c
··· 66 66 struct rxrpc_call *call) 67 67 { 68 68 rxrpc_seq_t tx_start, tx_win; 69 - signed long rtt2, timeout; 70 - u64 rtt; 69 + signed long rtt, timeout; 71 70 72 - rtt = READ_ONCE(call->peer->rtt); 73 - rtt2 = nsecs_to_jiffies64(rtt) * 2; 74 - if (rtt2 < 2) 75 - rtt2 = 2; 71 + rtt = READ_ONCE(call->peer->srtt_us) >> 3; 72 + rtt = usecs_to_jiffies(rtt) * 2; 73 + if (rtt < 2) 74 + rtt = 2; 76 75 77 - timeout = rtt2; 76 + timeout = rtt; 78 77 tx_start = READ_ONCE(call->tx_hard_ack); 79 78 80 79 for (;;) { ··· 91 92 return -EINTR; 92 93 93 94 if (tx_win != tx_start) { 94 - timeout = rtt2; 95 + timeout = rtt; 95 96 tx_start = tx_win; 96 97 } 97 98 ··· 270 271 _debug("need instant resend %d", ret); 271 272 rxrpc_instant_resend(call, ix); 272 273 } else { 273 - unsigned long now = jiffies, resend_at; 274 + unsigned long now = jiffies; 275 + unsigned long resend_at = now + call->peer->rto_j; 274 276 275 - if (call->peer->rtt_usage > 1) 276 - resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2); 277 - else 278 - resend_at = rxrpc_resend_timeout; 279 - if (resend_at < 1) 280 - resend_at = 1; 281 - 282 - resend_at += now; 283 277 WRITE_ONCE(call->resend_at, resend_at); 284 278 rxrpc_reduce_call_timer(call, resend_at, now, 285 279 rxrpc_timer_set_for_send);
-9
net/rxrpc/sysctl.c
··· 71 71 .extra1 = (void *)&one_jiffy, 72 72 .extra2 = (void *)&max_jiffies, 73 73 }, 74 - { 75 - .procname = "resend_timeout", 76 - .data = &rxrpc_resend_timeout, 77 - .maxlen = sizeof(unsigned long), 78 - .mode = 0644, 79 - .proc_handler = proc_doulongvec_ms_jiffies_minmax, 80 - .extra1 = (void *)&one_jiffy, 81 - .extra2 = (void *)&max_jiffies, 82 - }, 83 74 84 75 /* Non-time values */ 85 76 {