Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)

Implement path-MTU probing (along the lines of RFC8899) by padding some of
the PING ACKs we send. PING ACKs get their own individual responses quite
apart from the acking of data (though, as ACKs, they fulfil that role
also).

The probing concentrates on packet sizes that correspond to how many
subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are
just aggregations of individual DATA packets and can be split easily for
retransmission purposes.

If we want to perform probing, we advertise this by setting the maximum
number of jumbo subpackets to 0 in the ack trailer when we send an ACK and
see if the peer is also advertising the service. This is interpreted by
non-supporting Rx stacks as an indication that jumbo packets aren't
supported.

The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged
at a maximum of 1444 unless pmtud is supported by both sides.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

David Howells and committed by
Jakub Kicinski
eeaedc54 420f8af5

+382 -57
+124
include/trace/events/rxrpc.h
··· 364 364 EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ 365 365 EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ 366 366 EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ 367 + EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ 367 368 EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ 368 369 EM(rxrpc_propose_ack_ping_for_params, "Params ") \ 369 370 EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ ··· 479 478 EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ 480 479 E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") 481 480 481 + #define rxrpc_pmtud_reduce_traces \ 482 + EM(rxrpc_pmtud_reduce_ack, "Ack ") \ 483 + EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ 484 + E_(rxrpc_pmtud_reduce_route, "Route") 485 + 482 486 /* 483 487 * Generate enums for tracing information. 484 488 */ ··· 504 498 enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); 505 499 enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); 506 500 enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); 501 + enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); 507 502 enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); 508 503 enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); 509 504 enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); ··· 541 534 rxrpc_congest_modes; 542 535 rxrpc_conn_traces; 543 536 rxrpc_local_traces; 537 + rxrpc_pmtud_reduce_traces; 544 538 rxrpc_propose_ack_traces; 545 539 rxrpc_receive_traces; 546 540 rxrpc_recvmsg_traces; ··· 2046 2038 __entry->seq, 2047 2039 __print_symbolic(__entry->what, rxrpc_sack_traces), 2048 2040 __entry->sack) 2041 + ); 2042 + 2043 + TRACE_EVENT(rxrpc_pmtud_tx, 2044 + TP_PROTO(struct rxrpc_call *call), 2045 + 2046 + TP_ARGS(call), 2047 + 2048 + TP_STRUCT__entry( 2049 + __field(unsigned int, peer_debug_id) 2050 + __field(unsigned int, call_debug_id) 2051 + __field(rxrpc_serial_t, ping_serial) 2052 + __field(unsigned short, pmtud_trial) 2053 + __field(unsigned 
short, pmtud_good) 2054 + __field(unsigned short, pmtud_bad) 2055 + ), 2056 + 2057 + TP_fast_assign( 2058 + __entry->peer_debug_id = call->peer->debug_id; 2059 + __entry->call_debug_id = call->debug_id; 2060 + __entry->ping_serial = call->conn->pmtud_probe; 2061 + __entry->pmtud_trial = call->peer->pmtud_trial; 2062 + __entry->pmtud_good = call->peer->pmtud_good; 2063 + __entry->pmtud_bad = call->peer->pmtud_bad; 2064 + ), 2065 + 2066 + TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", 2067 + __entry->peer_debug_id, 2068 + __entry->call_debug_id, 2069 + __entry->ping_serial, 2070 + __entry->pmtud_good, 2071 + __entry->pmtud_trial, 2072 + __entry->pmtud_bad) 2073 + ); 2074 + 2075 + TRACE_EVENT(rxrpc_pmtud_rx, 2076 + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), 2077 + 2078 + TP_ARGS(conn, resp_serial), 2079 + 2080 + TP_STRUCT__entry( 2081 + __field(unsigned int, peer_debug_id) 2082 + __field(unsigned int, call_debug_id) 2083 + __field(rxrpc_serial_t, ping_serial) 2084 + __field(rxrpc_serial_t, resp_serial) 2085 + __field(unsigned short, max_data) 2086 + __field(u8, jumbo_max) 2087 + ), 2088 + 2089 + TP_fast_assign( 2090 + __entry->peer_debug_id = conn->peer->debug_id; 2091 + __entry->call_debug_id = conn->pmtud_call; 2092 + __entry->ping_serial = conn->pmtud_probe; 2093 + __entry->resp_serial = resp_serial; 2094 + __entry->max_data = conn->peer->max_data; 2095 + __entry->jumbo_max = conn->peer->pmtud_jumbo; 2096 + ), 2097 + 2098 + TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", 2099 + __entry->peer_debug_id, 2100 + __entry->call_debug_id, 2101 + __entry->ping_serial, 2102 + __entry->resp_serial, 2103 + __entry->max_data, 2104 + __entry->jumbo_max) 2105 + ); 2106 + 2107 + TRACE_EVENT(rxrpc_pmtud_lost, 2108 + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), 2109 + 2110 + TP_ARGS(conn, resp_serial), 2111 + 2112 + TP_STRUCT__entry( 2113 + __field(unsigned int, peer_debug_id) 2114 + __field(unsigned int, call_debug_id) 2115 + 
__field(rxrpc_serial_t, ping_serial) 2116 + __field(rxrpc_serial_t, resp_serial) 2117 + ), 2118 + 2119 + TP_fast_assign( 2120 + __entry->peer_debug_id = conn->peer->debug_id; 2121 + __entry->call_debug_id = conn->pmtud_call; 2122 + __entry->ping_serial = conn->pmtud_probe; 2123 + __entry->resp_serial = resp_serial; 2124 + ), 2125 + 2126 + TP_printk("P=%08x c=%08x pr=%08x rr=%08x", 2127 + __entry->peer_debug_id, 2128 + __entry->call_debug_id, 2129 + __entry->ping_serial, 2130 + __entry->resp_serial) 2131 + ); 2132 + 2133 + TRACE_EVENT(rxrpc_pmtud_reduce, 2134 + TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, 2135 + unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), 2136 + 2137 + TP_ARGS(peer, serial, max_data, reason), 2138 + 2139 + TP_STRUCT__entry( 2140 + __field(unsigned int, peer_debug_id) 2141 + __field(rxrpc_serial_t, serial) 2142 + __field(unsigned int, max_data) 2143 + __field(enum rxrpc_pmtud_reduce_trace, reason) 2144 + ), 2145 + 2146 + TP_fast_assign( 2147 + __entry->peer_debug_id = peer->debug_id; 2148 + __entry->serial = serial; 2149 + __entry->max_data = max_data; 2150 + __entry->reason = reason; 2151 + ), 2152 + 2153 + TP_printk("P=%08x %s r=%08x m=%u", 2154 + __entry->peer_debug_id, 2155 + __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces), 2156 + __entry->serial, __entry->max_data) 2049 2157 ); 2050 2158 2051 2159 #undef EM
+21 -4
net/rxrpc/ar-internal.h
··· 344 344 time64_t last_tx_at; /* Last time packet sent here */ 345 345 seqlock_t service_conn_lock; 346 346 spinlock_t lock; /* access lock */ 347 - unsigned int if_mtu; /* interface MTU for this peer */ 348 - unsigned int mtu; /* network MTU for this peer */ 349 - unsigned int maxdata; /* data size (MTU - hdrsize) */ 350 - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ 351 347 int debug_id; /* debug ID for printks */ 352 348 struct sockaddr_rxrpc srx; /* remote address */ 349 + 350 + /* Path MTU discovery [RFC8899] */ 351 + unsigned int pmtud_trial; /* Current MTU probe size */ 352 + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ 353 + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ 354 + bool pmtud_lost; /* T if MTU probe was lost */ 355 + bool pmtud_probing; /* T if we have an active probe outstanding */ 356 + bool pmtud_pending; /* T if a call to this peer should send a probe */ 357 + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ 358 + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ 359 + unsigned int ackr_max_data; /* Maximum data advertised by peer */ 360 + seqcount_t mtu_lock; /* Lockless MTU access management */ 361 + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ 362 + unsigned int max_data; /* Maximum packet data capacity for this peer */ 363 + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ 364 + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ 353 365 354 366 /* calculated RTT cache */ 355 367 #define RXRPC_RTT_CACHE_SIZE 32 ··· 543 531 int debug_id; /* debug ID for printks */ 544 532 rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ 545 533 unsigned int hi_serial; /* highest serial number received */ 534 + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ 535 + unsigned int pmtud_call; /* ID of call used for probe */ 546 536 u32 service_id; /* Service ID, possibly upgraded */ 
547 537 u32 security_level; /* Security level selected */ 548 538 u8 security_ix; /* security type */ ··· 1169 1155 */ 1170 1156 void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, 1171 1157 rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); 1158 + void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); 1172 1159 int rxrpc_send_abort_packet(struct rxrpc_call *); 1173 1160 void rxrpc_send_conn_abort(struct rxrpc_connection *conn); 1174 1161 void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); ··· 1181 1166 */ 1182 1167 void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); 1183 1168 void rxrpc_peer_keepalive_worker(struct work_struct *); 1169 + void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, 1170 + bool sendmsg_fail); 1184 1171 1185 1172 /* 1186 1173 * peer_object.c
+5
net/rxrpc/call_event.c
··· 483 483 rxrpc_disconnect_call(call); 484 484 if (call->security) 485 485 call->security->free_call_crypto(call); 486 + } else { 487 + if (skb && 488 + call->peer->ackr_adv_pmtud && 489 + call->peer->pmtud_pending) 490 + rxrpc_send_probe_for_pmtud(call); 486 491 } 487 492 if (call->acks_hard_ack != call->tx_bottom) 488 493 rxrpc_shrink_call_tx_buffer(call);
+11 -6
net/rxrpc/conn_event.c
··· 92 92 struct rxrpc_acktrailer trailer; 93 93 size_t len; 94 94 int ret, ioc; 95 - u32 serial, mtu, call_id, padding; 95 + u32 serial, max_mtu, if_mtu, call_id, padding; 96 96 97 97 _enter("%d", conn->debug_id); 98 98 ··· 150 150 break; 151 151 152 152 case RXRPC_PACKET_TYPE_ACK: 153 - mtu = conn->peer->if_mtu; 154 - mtu -= conn->peer->hdrsize; 153 + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; 154 + if (conn->peer->ackr_adv_pmtud) { 155 + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); 156 + } else { 157 + if_mtu = umin(1444, if_mtu); 158 + max_mtu = if_mtu; 159 + } 155 160 pkt.ack.bufferSpace = 0; 156 161 pkt.ack.maxSkew = htons(skb ? skb->priority : 0); 157 162 pkt.ack.firstPacket = htonl(chan->last_seq + 1); ··· 164 159 pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); 165 160 pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; 166 161 pkt.ack.nAcks = 0; 167 - trailer.maxMTU = htonl(rxrpc_rx_mtu); 168 - trailer.ifMTU = htonl(mtu); 162 + trailer.maxMTU = htonl(max_mtu); 163 + trailer.ifMTU = htonl(if_mtu); 169 164 trailer.rwind = htonl(rxrpc_rx_window_size); 170 - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); 165 + trailer.jumbo_max = 0; 171 166 pkt.whdr.flags |= RXRPC_SLOW_START_OK; 172 167 padding = 0; 173 168 iov[0].iov_len += sizeof(pkt.ack);
+6
net/rxrpc/conn_object.c
··· 321 321 list_del_init(&conn->proc_link); 322 322 write_unlock(&rxnet->conn_lock); 323 323 324 + if (conn->pmtud_probe) { 325 + trace_rxrpc_pmtud_lost(conn, 0); 326 + conn->peer->pmtud_probing = false; 327 + conn->peer->pmtud_pending = true; 328 + } 329 + 324 330 rxrpc_purge_queue(&conn->rx_queue); 325 331 326 332 rxrpc_kill_client_conn(conn);
+17 -9
net/rxrpc/input.c
··· 692 692 struct rxrpc_acktrailer *trailer) 693 693 { 694 694 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 695 - struct rxrpc_peer *peer; 696 - unsigned int mtu; 695 + struct rxrpc_peer *peer = call->peer; 696 + unsigned int max_data; 697 697 bool wake = false; 698 698 u32 rwind = ntohl(trailer->rwind); 699 699 ··· 706 706 call->tx_winsize = rwind; 707 707 } 708 708 709 - mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); 709 + if (trailer->jumbo_max == 0) { 710 + /* The peer says it supports pmtu discovery */ 711 + peer->ackr_adv_pmtud = true; 712 + } else { 713 + peer->ackr_adv_pmtud = false; 714 + } 710 715 711 - peer = call->peer; 712 - if (mtu < peer->maxdata) { 713 - spin_lock(&peer->lock); 714 - peer->maxdata = mtu; 715 - peer->mtu = mtu + peer->hdrsize; 716 - spin_unlock(&peer->lock); 716 + max_data = ntohl(trailer->maxMTU); 717 + peer->ackr_max_data = max_data; 718 + 719 + if (max_data < peer->max_data) { 720 + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data, 721 + rxrpc_pmtud_reduce_ack); 722 + write_seqcount_begin(&peer->mtu_lock); 723 + peer->max_data = max_data; 724 + write_seqcount_end(&peer->mtu_lock); 717 725 } 718 726 719 727 if (wake)
+6
net/rxrpc/io_thread.c
··· 364 364 if (sp->hdr.callNumber == 0) 365 365 return rxrpc_input_conn_packet(conn, skb); 366 366 367 + /* Deal with path MTU discovery probing. */ 368 + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && 369 + conn->pmtud_probe && 370 + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) 371 + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); 372 + 367 373 /* Call-bound packets are routed by connection channel. */ 368 374 channel = sp->hdr.cid & RXRPC_CHANNELMASK; 369 375 chan = &conn->channels[channel];
+2 -2
net/rxrpc/misc.c
··· 46 46 * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet 47 47 * made by gluing normal packets together that we're willing to handle. 48 48 */ 49 - unsigned int rxrpc_rx_mtu = 5692; 49 + unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); 50 50 51 51 /* 52 52 * The maximum number of fragments in a received jumbo packet that we tell the 53 53 * sender that we're willing to handle. 54 54 */ 55 - unsigned int rxrpc_rx_jumbo_max = 4; 55 + unsigned int rxrpc_rx_jumbo_max = 46; 56 56 57 57 #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY 58 58 /*
+53 -14
net/rxrpc/output.c
··· 82 82 struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; 83 83 struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; 84 84 struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); 85 - unsigned int qsize, sack, wrap, to; 85 + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; 86 86 rxrpc_seq_t window, wtop; 87 87 int rsize; 88 - u32 mtu, jmax; 89 88 u8 *filler = txb->kvec[2].iov_base; 90 89 u8 *sackp = txb->kvec[1].iov_base; 91 90 ··· 131 132 ack->reason = RXRPC_ACK_IDLE; 132 133 } 133 134 134 - mtu = call->peer->if_mtu; 135 - mtu -= call->peer->hdrsize; 136 - jmax = rxrpc_rx_jumbo_max; 137 135 qsize = (window - 1) - call->rx_consumed; 138 136 rsize = max_t(int, call->rx_winsize - qsize, 0); 139 137 txb->ack_rwind = rsize; 140 - trailer->maxMTU = htonl(rxrpc_rx_mtu); 141 - trailer->ifMTU = htonl(mtu); 138 + 139 + if_mtu = call->peer->if_mtu - call->peer->hdrsize; 140 + if (call->peer->ackr_adv_pmtud) { 141 + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); 142 + } else { 143 + if_mtu = umin(if_mtu, 1444); 144 + max_mtu = if_mtu; 145 + } 146 + 147 + trailer->maxMTU = htonl(max_mtu); 148 + trailer->ifMTU = htonl(if_mtu); 142 149 trailer->rwind = htonl(rsize); 143 - trailer->jumbo_max = htonl(jmax); 150 + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ 144 151 } 145 152 146 153 /* ··· 181 176 * Transmit an ACK packet. 
182 177 */ 183 178 static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, 184 - int nr_kv) 179 + int nr_kv, enum rxrpc_propose_ack_trace why) 185 180 { 186 181 struct kvec *kv = call->local->kvec; 187 182 struct rxrpc_wire_header *whdr = kv[0].iov_base; ··· 214 209 rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); 215 210 216 211 iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); 217 - rxrpc_local_dont_fragment(conn->local, false); 212 + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); 218 213 219 214 ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); 220 215 call->peer->last_tx_at = ktime_get_seconds(); 221 216 if (ret < 0) { 222 217 trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, 223 218 rxrpc_tx_point_call_ack); 219 + if (why == rxrpc_propose_ack_ping_for_mtu_probe && 220 + ret == -EMSGSIZE) 221 + rxrpc_input_probe_for_pmtud(conn, txb->serial, true); 224 222 } else { 225 223 trace_rxrpc_tx_packet(call->debug_id, whdr, 226 224 rxrpc_tx_point_call_ack); ··· 233 225 if (txb->flags & RXRPC_REQUEST_ACK) 234 226 call->peer->rtt_last_req = now; 235 227 rxrpc_set_keepalive(call, now); 228 + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { 229 + call->peer->pmtud_pending = false; 230 + call->peer->pmtud_probing = true; 231 + call->conn->pmtud_probe = txb->serial; 232 + call->conn->pmtud_call = call->debug_id; 233 + trace_rxrpc_pmtud_tx(call); 234 + } 236 235 } 237 236 rxrpc_tx_backoff(call, ret); 238 237 } ··· 269 254 270 255 rxrpc_fill_out_ack(call, txb, ack_reason, serial); 271 256 257 + /* Extend a path MTU probe ACK. 
*/ 272 258 nr_kv = txb->nr_kvec; 273 259 kv[0] = txb->kvec[0]; 274 260 kv[1] = txb->kvec[1]; 275 261 kv[2] = txb->kvec[2]; 276 - // TODO: Extend a path MTU probe ACK 262 + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { 263 + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); 264 + 265 + if (txb->len > probe_mtu) 266 + goto skip; 267 + while (txb->len < probe_mtu) { 268 + size_t part = umin(probe_mtu - txb->len, PAGE_SIZE); 269 + 270 + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); 271 + kv[nr_kv].iov_len = part; 272 + txb->len += part; 273 + nr_kv++; 274 + } 275 + } 277 276 278 277 call->ackr_nr_unacked = 0; 279 278 atomic_set(&call->ackr_nr_consumed, 0); 280 279 clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); 281 280 282 281 trace_rxrpc_send_ack(call, why, ack_reason, serial); 283 - rxrpc_send_ack_packet(call, txb, nr_kv); 282 + rxrpc_send_ack_packet(call, txb, nr_kv, why); 283 + skip: 284 284 rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); 285 + } 286 + 287 + /* 288 + * Send an ACK probe for path MTU discovery. 289 + */ 290 + void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) 291 + { 292 + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, 293 + rxrpc_propose_ack_ping_for_mtu_probe); 285 294 } 286 295 287 296 /* ··· 540 501 541 502 /* send the packet with the don't fragment bit set if we currently 542 503 * think it's small enough */ 543 - if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) { 504 + if (len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { 544 505 rxrpc_local_dont_fragment(conn->local, false); 545 506 frag = rxrpc_tx_point_call_data_frag; 546 507 } else { ··· 587 548 RX_USER_ABORT, ret); 588 549 } 589 550 590 - _leave(" = %d [%u]", ret, call->peer->maxdata); 551 + _leave(" = %d [%u]", ret, call->peer->max_data); 591 552 return ret; 592 553 } 593 554
+99 -5
net/rxrpc/peer_event.c
··· 102 102 */ 103 103 static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) 104 104 { 105 + unsigned int max_data; 106 + 105 107 /* wind down the local interface MTU */ 106 108 if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) 107 109 peer->if_mtu = mtu; ··· 122 120 } 123 121 } 124 122 125 - if (mtu < peer->mtu) { 126 - spin_lock(&peer->lock); 127 - peer->mtu = mtu; 128 - peer->maxdata = peer->mtu - peer->hdrsize; 129 - spin_unlock(&peer->lock); 123 + max_data = max_t(int, mtu - peer->hdrsize, 500); 124 + if (max_data < peer->max_data) { 125 + if (peer->pmtud_good > max_data) 126 + peer->pmtud_good = max_data; 127 + if (peer->pmtud_bad > max_data + 1) 128 + peer->pmtud_bad = max_data + 1; 129 + 130 + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); 131 + write_seqcount_begin(&peer->mtu_lock); 132 + peer->max_data = max_data; 133 + write_seqcount_end(&peer->mtu_lock); 130 134 } 131 135 } 132 136 ··· 354 346 timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); 355 347 356 348 _leave(""); 349 + } 350 + 351 + /* 352 + * Do path MTU probing. 353 + */ 354 + void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, 355 + bool sendmsg_fail) 356 + { 357 + struct rxrpc_peer *peer = conn->peer; 358 + unsigned int max_data = peer->max_data; 359 + int good, trial, bad, jumbo; 360 + 361 + good = peer->pmtud_good; 362 + trial = peer->pmtud_trial; 363 + bad = peer->pmtud_bad; 364 + if (good >= bad - 1) { 365 + conn->pmtud_probe = 0; 366 + peer->pmtud_lost = false; 367 + return; 368 + } 369 + 370 + if (!peer->pmtud_probing) 371 + goto send_probe; 372 + 373 + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { 374 + /* Retry a lost probe. */ 375 + if (!peer->pmtud_lost) { 376 + trace_rxrpc_pmtud_lost(conn, acked_serial); 377 + conn->pmtud_probe = 0; 378 + peer->pmtud_lost = true; 379 + goto send_probe; 380 + } 381 + 382 + /* The probed size didn't seem to get through. 
*/ 383 + bad = trial; 384 + peer->pmtud_bad = bad; 385 + if (bad <= max_data) 386 + max_data = bad - 1; 387 + } else { 388 + /* It did get through. */ 389 + good = trial; 390 + peer->pmtud_good = good; 391 + if (good > max_data) 392 + max_data = good; 393 + } 394 + 395 + max_data = umin(max_data, peer->ackr_max_data); 396 + if (max_data != peer->max_data) { 397 + preempt_disable(); 398 + write_seqcount_begin(&peer->mtu_lock); 399 + peer->max_data = max_data; 400 + write_seqcount_end(&peer->mtu_lock); 401 + preempt_enable(); 402 + } 403 + 404 + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); 405 + jumbo /= RXRPC_JUMBO_SUBPKTLEN; 406 + peer->pmtud_jumbo = jumbo; 407 + 408 + trace_rxrpc_pmtud_rx(conn, acked_serial); 409 + conn->pmtud_probe = 0; 410 + peer->pmtud_lost = false; 411 + 412 + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) 413 + trial = RXRPC_JUMBO(2); 414 + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) 415 + trial = RXRPC_JUMBO(4); 416 + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) 417 + trial = RXRPC_JUMBO(3); 418 + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) 419 + trial = RXRPC_JUMBO(6); 420 + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) 421 + trial = RXRPC_JUMBO(5); 422 + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) 423 + trial = RXRPC_JUMBO(8); 424 + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) 425 + trial = RXRPC_JUMBO(7); 426 + else 427 + trial = (good + bad) / 2; 428 + peer->pmtud_trial = trial; 429 + 430 + if (good >= bad) 431 + return; 432 + 433 + send_probe: 434 + peer->pmtud_pending = true; 357 435 }
+20 -4
net/rxrpc/peer_object.c
··· 162 162 #endif 163 163 164 164 peer->if_mtu = 1500; 165 + if (peer->max_data < peer->if_mtu - peer->hdrsize) { 166 + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, 167 + rxrpc_pmtud_reduce_route); 168 + peer->max_data = peer->if_mtu - peer->hdrsize; 169 + } 165 170 166 171 memset(&fl, 0, sizeof(fl)); 167 172 switch (peer->srx.transport.family) { ··· 204 199 } 205 200 206 201 peer->if_mtu = dst_mtu(dst); 202 + peer->hdrsize += dst->header_len + dst->trailer_len; 203 + peer->tx_seg_max = dst->dev->gso_max_segs; 207 204 dst_release(dst); 205 + 206 + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); 207 + peer->pmtud_good = 500; 208 + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; 209 + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); 210 + peer->pmtud_pending = true; 208 211 209 212 _leave(" [if_mtu %u]", peer->if_mtu); 210 213 } ··· 236 223 seqlock_init(&peer->service_conn_lock); 237 224 spin_lock_init(&peer->lock); 238 225 spin_lock_init(&peer->rtt_input_lock); 226 + seqcount_init(&peer->mtu_lock); 239 227 peer->debug_id = atomic_inc_return(&rxrpc_debug_id); 240 228 241 229 rxrpc_peer_init_rtt(peer); ··· 256 242 unsigned long hash_key) 257 243 { 258 244 peer->hash_key = hash_key; 259 - rxrpc_assess_MTU_size(local, peer); 260 - peer->mtu = peer->if_mtu; 261 - peer->rtt_last_req = ktime_get_real(); 245 + 262 246 263 247 switch (peer->srx.transport.family) { 264 248 case AF_INET: ··· 280 268 } 281 269 282 270 peer->hdrsize += sizeof(struct rxrpc_wire_header); 283 - peer->maxdata = peer->mtu - peer->hdrsize; 271 + peer->max_data = peer->if_mtu - peer->hdrsize; 272 + 273 + rxrpc_assess_MTU_size(local, peer); 274 + 275 + peer->rtt_last_req = ktime_get_real(); 284 276 } 285 277 286 278 /*
+3 -6
net/rxrpc/proc.c
··· 283 283 284 284 if (v == SEQ_START_TOKEN) { 285 285 seq_puts(seq, 286 - "Proto Local " 287 - " Remote " 288 - " Use SST MTU LastUse RTT RTO\n" 286 + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" 289 287 ); 290 288 return 0; 291 289 } ··· 296 298 297 299 now = ktime_get_seconds(); 298 300 seq_printf(seq, 299 - "UDP %-47.47s %-47.47s %3u" 300 - " %3u %5u %6llus %8u %8u\n", 301 + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8u %8u\n", 301 302 lbuff, 302 303 rbuff, 303 304 refcount_read(&peer->ref), 304 305 peer->cong_ssthresh, 305 - peer->mtu, 306 + peer->max_data, 306 307 now - peer->last_tx_at, 307 308 peer->srtt_us >> 3, 308 309 peer->rto_us);
+9 -4
net/rxrpc/protocol.h
··· 92 92 /* 93 93 * The maximum number of subpackets that can possibly fit in a UDP packet is: 94 94 * 95 - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 96 - * = ((65535 - 28 - 28) / 1416) + 1 97 - * = 46 non-terminal packets and 1 terminal packet. 95 + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) 96 + * = ((65535 - 28 + 4) / 1416) 97 + * = 45 non-terminal packets and 1 terminal packet. 98 98 */ 99 - #define RXRPC_MAX_NR_JUMBO 47 99 + #define RXRPC_MAX_NR_JUMBO 46 100 + 101 + /* Size of a jumbo packet with N subpackets, excluding UDP+IP */ 102 + #define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ 103 + RXRPC_JUMBO_DATALEN + \ 104 + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) 100 105 101 106 /*****************************************************************************/ 102 107 /*
+4 -2
net/rxrpc/sysctl.c
··· 11 11 #include "ar-internal.h" 12 12 13 13 static struct ctl_table_header *rxrpc_sysctl_reg_table; 14 + static const unsigned int rxrpc_rx_mtu_min = 500; 15 + static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; 14 16 static const unsigned int four = 4; 15 17 static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; 16 18 static const unsigned int n_65535 = 65535; ··· 117 115 .maxlen = sizeof(unsigned int), 118 116 .mode = 0644, 119 117 .proc_handler = proc_dointvec_minmax, 120 - .extra1 = (void *)SYSCTL_ONE, 118 + .extra1 = (void *)&rxrpc_rx_mtu_min, 121 119 .extra2 = (void *)&n_65535, 122 120 }, 123 121 { ··· 127 125 .mode = 0644, 128 126 .proc_handler = proc_dointvec_minmax, 129 127 .extra1 = (void *)SYSCTL_ONE, 130 - .extra2 = (void *)&four, 128 + .extra2 = (void *)&rxrpc_jumbo_max, 131 129 }, 132 130 }; 133 131
+2 -1
net/rxrpc/txbuf.c
··· 179 179 trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, 180 180 rxrpc_txbuf_free); 181 181 for (i = 0; i < txb->nr_kvec; i++) 182 - if (txb->kvec[i].iov_base) 182 + if (txb->kvec[i].iov_base && 183 + !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) 183 184 page_frag_free(txb->kvec[i].iov_base); 184 185 kfree(txb); 185 186 atomic_dec(&rxrpc_nr_txbuf);