Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: ipv4: Cache pmtu for all packet paths if multipath enabled

Check the number of paths with fib_info_num_path(),
and call update_or_create_fnhe() for every path.
The problem is that the PMTU is cached only for the oif
that received the ICMP "need to frag" message;
other oifs will still try to use the "default" iface MTU.

An example topology showing the problem:

                      |          host1
                 +---------+
                 | dummy0  | 10.179.20.18/32 mtu9000
                 +---------+
         +------------+-------------------+
    +---------+                      +---------+
    | ens17f0 | 10.179.2.141/31      | ens17f1 | 10.179.2.13/31
    +---------+                      +---------+
         |    (all here have mtu 9000)    |
     +------+                         +------+
     | ro1  | 10.179.2.140/31         | ro2  | 10.179.2.12/31
     +------+                         +------+
         |                                |
---------+------------+-------------------+------
                      |
                   +-----+
                   | ro3 | 10.10.10.10 mtu1500
                   +-----+
                      |
  ========================================
                some networks
  ========================================
                      |
                   +-----+
                   | eth0| 10.10.30.30 mtu9000
                   +-----+
                      |          host2

host1 has multipath enabled and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
nexthop via 10.179.2.12 dev ens17f1 weight 1
nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do PMTUD from 10.179.20.18/32 to host2,
host1 receives on the ens17f1 iface an ICMP packet from ro3 reporting that ro3's MTU is 1500,
and host1 caches it in the nexthop exceptions cache.

The problem is that it is cached only for the iface that received the ICMP message,
and there is no way that ro3 will send an ICMP message to host1 via another path.

Host1 now has these routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
cache

So when host1 tries again to reach host2 with mtu>1500,
if the packet flow is lucky enough to be hashed to oif=ens17f1, it's OK;
if oif=ens17f0, it blackholes and still gets ICMP messages from ro3 on ens17f1,
until the lucky day when ro3 sends one through another flow to ens17f0.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Vladimir Vdovin and committed by
Jakub Kicinski
7d3f3b43 43271bb5

+108 -17
+13
net/ipv4/route.c
··· 1027 1027 struct fib_nh_common *nhc; 1028 1028 1029 1029 fib_select_path(net, &res, fl4, NULL); 1030 + #ifdef CONFIG_IP_ROUTE_MULTIPATH 1031 + if (fib_info_num_path(res.fi) > 1) { 1032 + int nhsel; 1033 + 1034 + for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { 1035 + nhc = fib_info_nhc(res.fi, nhsel); 1036 + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, 1037 + jiffies + net->ipv4.ip_rt_mtu_expires); 1038 + } 1039 + rcu_read_unlock(); 1040 + return; 1041 + } 1042 + #endif /* CONFIG_IP_ROUTE_MULTIPATH */ 1030 1043 nhc = FIB_RES_NHC(res); 1031 1044 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, 1032 1045 jiffies + net->ipv4.ip_rt_mtu_expires);
+95 -17
tools/testing/selftests/net/pmtu.sh
··· 197 197 # 198 198 # - pmtu_ipv6_route_change 199 199 # Same as above but with IPv6 200 + # 201 + # - pmtu_ipv4_mp_exceptions 202 + # Use the same topology as in pmtu_ipv4, but add routeable addresses 203 + # on host A and B on lo reachable via both routers. Host A and B 204 + # addresses have multipath routes to each other, b_r1 mtu = 1500. 205 + # Check that PMTU exceptions are created for both paths. 200 206 201 207 source lib.sh 202 208 source net_helper.sh ··· 272 266 list_flush_ipv4_exception ipv4: list and flush cached exceptions 1 273 267 list_flush_ipv6_exception ipv6: list and flush cached exceptions 1 274 268 pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1 275 - pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1" 269 + pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1 270 + pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1" 276 271 277 272 # Addressing and routing for tests with routers: four network segments, with 278 273 # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an ··· 349 342 tunnel6_a_addr="fd00:2::a" 350 343 tunnel6_b_addr="fd00:2::b" 351 344 tunnel6_mask="64" 345 + 346 + host4_a_addr="192.168.99.99" 347 + host4_b_addr="192.168.88.88" 352 348 353 349 dummy6_0_prefix="fc00:1000::" 354 350 dummy6_1_prefix="fc00:1001::" ··· 994 984 run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2 995 985 } 996 986 987 + setup_multipath_new() { 988 + # Set up host A with multipath routes to host B host4_b_addr 989 + run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo 990 + run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1 991 + run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2 992 + run_cmd ${ns_a} ip nexthop add id 403 group 401/402 993 + run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403 994 + 995 + # Set up host B with multipath routes to host A host4_a_addr 996 + run_cmd ${ns_b} ip addr add 
${host4_b_addr} dev lo 997 + run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1 998 + run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2 999 + run_cmd ${ns_b} ip nexthop add id 403 group 401/402 1000 + run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403 1001 + } 1002 + 1003 + setup_multipath_old() { 1004 + # Set up host A with multipath routes to host B host4_b_addr 1005 + run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo 1006 + run_cmd ${ns_a} ip route add ${host4_b_addr} \ 1007 + src ${host4_a_addr} \ 1008 + nexthop via ${prefix4}.${a_r1}.2 weight 1 \ 1009 + nexthop via ${prefix4}.${a_r2}.2 weight 1 1010 + 1011 + # Set up host B with multipath routes to host A host4_a_addr 1012 + run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo 1013 + run_cmd ${ns_b} ip route add ${host4_a_addr} \ 1014 + src ${host4_b_addr} \ 1015 + nexthop via ${prefix4}.${b_r1}.2 weight 1 \ 1016 + nexthop via ${prefix4}.${b_r2}.2 weight 1 1017 + } 1018 + 1019 + setup_multipath() { 1020 + if [ "$USE_NH" = "yes" ]; then 1021 + setup_multipath_new 1022 + else 1023 + setup_multipath_old 1024 + fi 1025 + 1026 + # Set up routers with routes to dummies 1027 + run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1 1028 + run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1 1029 + run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1 1030 + run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1 1031 + } 1032 + 997 1033 setup() { 998 1034 [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip 999 1035 ··· 1132 1076 } 1133 1077 1134 1078 route_get_dst_exception() { 1135 - ns_cmd="${1}" 1136 - dst="${2}" 1137 - dsfield="${3}" 1079 + ns_cmd="${1}"; shift 1138 1080 1139 - if [ -z "${dsfield}" ]; then 1140 - dsfield=0 1141 - fi 1142 - 1143 - ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}" 1081 + ${ns_cmd} ip route get "$@" 1144 1082 } 1145 1083 
1146 1084 route_get_dst_pmtu_from_exception() { 1147 - ns_cmd="${1}" 1148 - dst="${2}" 1149 - dsfield="${3}" 1085 + ns_cmd="${1}"; shift 1150 1086 1151 - mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")" 1087 + mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")" 1152 1088 } 1153 1089 1154 1090 check_pmtu_value() { ··· 1283 1235 run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}" 1284 1236 1285 1237 # Check that exceptions have been created with the correct PMTU 1286 - pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" 1238 + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")" 1287 1239 check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 1288 1240 1289 - pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" 1241 + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")" 1290 1242 check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 1291 1243 } 1292 1244 ··· 1333 1285 UDP:"${dst2}":50000,tos="${dsfield}" 1334 1286 1335 1287 # Check that exceptions have been created with the correct PMTU 1336 - pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" 1288 + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")" 1337 1289 check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 1338 - pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" 1290 + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")" 1339 1291 check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 1340 1292 } 1341 1293 ··· 2375 2327 2376 2328 test_pmtu_ipv6_route_change() { 2377 2329 test_pmtu_ipvX_route_change 6 2330 + } 2331 + 2332 + test_pmtu_ipv4_mp_exceptions() { 2333 + setup namespaces routing multipath || return $ksft_skip 2334 + 2335 + trace 
"${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ 2336 + "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ 2337 + "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ 2338 + "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2 2339 + 2340 + # Set up initial MTU values 2341 + mtu "${ns_a}" veth_A-R1 2000 2342 + mtu "${ns_r1}" veth_R1-A 2000 2343 + mtu "${ns_r1}" veth_R1-B 1500 2344 + mtu "${ns_b}" veth_B-R1 1500 2345 + 2346 + mtu "${ns_a}" veth_A-R2 2000 2347 + mtu "${ns_r2}" veth_R2-A 2000 2348 + mtu "${ns_r2}" veth_R2-B 1500 2349 + mtu "${ns_b}" veth_B-R2 1500 2350 + 2351 + # Ping and expect two nexthop exceptions for two routes 2352 + run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}" 2353 + 2354 + # Check that exceptions have been created with the correct PMTU 2355 + pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)" 2356 + pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)" 2357 + 2358 + check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1 2359 + check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1 2378 2360 } 2379 2361 2380 2362 usage() {