Merge branch 'ipv4-icmp-fix-source-ip-derivation-in-presence-of-vrfs'

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Ido Schimmel says:

====================
ipv4: icmp: Fix source IP derivation in presence of VRFs

Align IPv4 with IPv6 and, in the presence of VRFs, generate ICMP error
messages with a source IP that is derived from the receiving interface
and not from its VRF master. This is especially important when the error
messages are "Time Exceeded" messages, as otherwise utilities like
traceroute will show an incorrect packet path.

Patches #1-#2 are preparations.

Patch #3 is the actual change.

Patches #4-#7 make small improvements in the existing traceroute test.

Patch #8 extends the traceroute test with VRF test cases for both IPv4
and IPv6.

Changes since v1 [1]:
* Rebase.

[1] https://lore.kernel.org/netdev/20250901083027.183468-1-idosch@nvidia.com/
====================

Link: https://patch.msgid.link/20250908073238.119240-1-idosch@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Paolo Abeni 7 months ago 5adf6f2b 7f0b763b

+229 -69

5 changed files

expand all

include

net

icmp.h

net

ipv4

cipso_ipv4.c

icmp.c

route.c

tools

testing

selftests

net

traceroute.sh

+6 -4

include/net/icmp.h

··· 37 37 struct net; 38 38 39 39 void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, 40 - const struct ip_options *opt); 40 + const struct inet_skb_parm *parm); 41 41 static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) 42 42 { 43 - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); 43 + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); 44 44 } 45 45 46 46 #if IS_ENABLED(CONFIG_NF_NAT) ··· 48 48 #else 49 49 static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) 50 50 { 51 - struct ip_options opts = { 0 }; 52 - __icmp_send(skb_in, type, code, info, &opts); 51 + struct inet_skb_parm parm; 52 + 53 + memset(&parm, 0, sizeof(parm)); 54 + __icmp_send(skb_in, type, code, info, &parm); 53 55 } 54 56 #endif 55 57

+6 -7

net/ipv4/cipso_ipv4.c

··· 1715 1715 */ 1716 1716 void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) 1717 1717 { 1718 - unsigned char optbuf[sizeof(struct ip_options) + 40]; 1719 - struct ip_options *opt = (struct ip_options *)optbuf; 1718 + struct inet_skb_parm parm; 1720 1719 int res; 1721 1720 1722 1721 if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) ··· 1726 1727 * so we can not use icmp_send and IPCB here. 1727 1728 */ 1728 1729 1729 - memset(opt, 0, sizeof(struct ip_options)); 1730 - opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); 1730 + memset(&parm, 0, sizeof(parm)); 1731 + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); 1731 1732 rcu_read_lock(); 1732 - res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); 1733 + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); 1733 1734 rcu_read_unlock(); 1734 1735 1735 1736 if (res) 1736 1737 return; 1737 1738 1738 1739 if (gateway) 1739 - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); 1740 + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); 1740 1741 else 1741 - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); 1742 + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); 1742 1743 } 1743 1744 1744 1745 /**

+9 -6

net/ipv4/icmp.c

··· 594 594 */ 595 595 596 596 void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, 597 - const struct ip_options *opt) 597 + const struct inet_skb_parm *parm) 598 598 { 599 599 struct iphdr *iph; 600 600 int room; ··· 710 710 rcu_read_lock(); 711 711 if (rt_is_input_route(rt) && 712 712 READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) 713 - dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); 713 + dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : 714 + inet_iif(skb_in)); 714 715 715 716 if (dev) 716 717 saddr = inet_select_addr(dev, iph->saddr, ··· 726 725 iph->tos; 727 726 mark = IP4_REPLY_MARK(net, skb_in->mark); 728 727 729 - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) 728 + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, 729 + &parm->opt)) 730 730 goto out_unlock; 731 731 732 732 ··· 801 799 void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) 802 800 { 803 801 struct sk_buff *cloned_skb = NULL; 804 - struct ip_options opts = { 0 }; 805 802 enum ip_conntrack_info ctinfo; 806 803 enum ip_conntrack_dir dir; 804 + struct inet_skb_parm parm; 807 805 struct nf_conn *ct; 808 806 __be32 orig_ip; 809 807 808 + memset(&parm, 0, sizeof(parm)); 810 809 ct = nf_ct_get(skb_in, &ctinfo); 811 810 if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { 812 - __icmp_send(skb_in, type, code, info, &opts); 811 + __icmp_send(skb_in, type, code, info, &parm); 813 812 return; 814 813 } 815 814 ··· 826 823 orig_ip = ip_hdr(skb_in)->saddr; 827 824 dir = CTINFO2DIR(ctinfo); 828 825 ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; 829 - __icmp_send(skb_in, type, code, info, &opts); 826 + __icmp_send(skb_in, type, code, info, &parm); 830 827 ip_hdr(skb_in)->saddr = orig_ip; 831 828 out: 832 829 consume_skb(cloned_skb);

+5 -5

net/ipv4/route.c

··· 1222 1222 1223 1223 static void ipv4_send_dest_unreach(struct sk_buff *skb) 1224 1224 { 1225 + struct inet_skb_parm parm; 1225 1226 struct net_device *dev; 1226 - struct ip_options opt; 1227 1227 int res; 1228 1228 1229 1229 /* Recompile ip options since IPCB may not be valid anymore. ··· 1233 1233 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) 1234 1234 return; 1235 1235 1236 - memset(&opt, 0, sizeof(opt)); 1236 + memset(&parm, 0, sizeof(parm)); 1237 1237 if (ip_hdr(skb)->ihl > 5) { 1238 1238 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) 1239 1239 return; 1240 - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); 1240 + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); 1241 1241 1242 1242 rcu_read_lock(); 1243 1243 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; 1244 - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); 1244 + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); 1245 1245 rcu_read_unlock(); 1246 1246 1247 1247 if (res) 1248 1248 return; 1249 1249 } 1250 - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); 1250 + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); 1251 1251 } 1252 1252 1253 1253 static void ipv4_link_failure(struct sk_buff *skb)

+203 -47

tools/testing/selftests/net/traceroute.sh

··· 10 10 11 11 ################################################################################ 12 12 # 13 - log_test() 14 - { 15 - local rc=$1 16 - local expected=$2 17 - local msg="$3" 18 - 19 - if [ ${rc} -eq ${expected} ]; then 20 - printf "TEST: %-60s [ OK ]\n" "${msg}" 21 - nsuccess=$((nsuccess+1)) 22 - else 23 - ret=1 24 - nfail=$((nfail+1)) 25 - printf "TEST: %-60s [FAIL]\n" "${msg}" 26 - if [ "${PAUSE_ON_FAIL}" = "yes" ]; then 27 - echo 28 - echo "hit enter to continue, 'q' to quit" 29 - read a 30 - [ "$a" = "q" ] && exit 1 31 - fi 32 - fi 33 - } 34 - 35 13 run_cmd() 36 14 { 37 15 local ns ··· 181 203 182 204 run_traceroute6() 183 205 { 184 - if [ ! -x "$(command -v traceroute6)" ]; then 185 - echo "SKIP: Could not run IPV6 test without traceroute6" 186 - return 187 - fi 188 - 189 206 setup_traceroute6 207 + 208 + RET=0 190 209 191 210 # traceroute6 host-2 from host-1 (expects 2000:102::2) 192 211 run_cmd $h1 "traceroute6 2000:103::4 | grep -q 2000:102::2" 193 - log_test $? 0 "IPV6 traceroute" 212 + check_err $? "traceroute6 did not return 2000:102::2" 213 + log_test "IPv6 traceroute" 194 214 195 215 cleanup_traceroute6 196 216 } 197 217 198 218 ################################################################################ 219 + # traceroute6 with VRF test 220 + # 221 + # Verify that in this scenario 222 + # 223 + # ------------------------ N2 224 + # | | 225 + # ------ ------ N3 ---- 226 + # | R1 | | R2 |------|H2| 227 + # ------ ------ ---- 228 + # | | 229 + # ------------------------ N1 230 + # | 231 + # ---- 232 + # |H1| 233 + # ---- 234 + # 235 + # Where H1's default route goes through R1 and R1's default route goes through 236 + # R2 over N2, traceroute6 from H1 to H2 reports R2's address on N2 and not N1. 237 + # The interfaces connecting R2 to the different subnets are members of a VRF 238 + # and the intention is to check that traceroute6 does not report the VRF's 239 + # address. 
240 + # 241 + # Addresses are assigned as follows: 242 + # 243 + # N1: 2000:101::/64 244 + # N2: 2000:102::/64 245 + # N3: 2000:103::/64 246 + # 247 + # R1's host part of address: 1 248 + # R2's host part of address: 2 249 + # H1's host part of address: 3 250 + # H2's host part of address: 4 251 + # 252 + # For example: 253 + # the IPv6 address of R1's interface on N2 is 2000:102::1/64 254 + 255 + cleanup_traceroute6_vrf() 256 + { 257 + cleanup_all_ns 258 + } 259 + 260 + setup_traceroute6_vrf() 261 + { 262 + # Start clean 263 + cleanup_traceroute6_vrf 264 + 265 + setup_ns h1 h2 r1 r2 266 + create_ns "$h1" 267 + create_ns "$h2" 268 + create_ns "$r1" 269 + create_ns "$r2" 270 + 271 + ip -n "$r2" link add name vrf100 up type vrf table 100 272 + ip -n "$r2" addr add 2001:db8:100::1/64 dev vrf100 273 + 274 + # Setup N3 275 + connect_ns "$r2" eth3 - 2000:103::2/64 "$h2" eth3 - 2000:103::4/64 276 + 277 + ip -n "$r2" link set dev eth3 master vrf100 278 + 279 + ip -n "$h2" route add default via 2000:103::2 280 + 281 + # Setup N2 282 + connect_ns "$r1" eth2 - 2000:102::1/64 "$r2" eth2 - 2000:102::2/64 283 + 284 + ip -n "$r1" route add default via 2000:102::2 285 + 286 + ip -n "$r2" link set dev eth2 master vrf100 287 + 288 + # Setup N1. host-1 and router-2 connect to a bridge in router-1. 
289 + ip -n "$r1" link add name br100 up type bridge 290 + ip -n "$r1" addr add 2000:101::1/64 dev br100 291 + 292 + connect_ns "$h1" eth0 - 2000:101::3/64 "$r1" eth0 - - 293 + 294 + ip -n "$h1" route add default via 2000:101::1 295 + 296 + ip -n "$r1" link set dev eth0 master br100 297 + 298 + connect_ns "$r2" eth1 - 2000:101::2/64 "$r1" eth1 - - 299 + 300 + ip -n "$r2" link set dev eth1 master vrf100 301 + 302 + ip -n "$r1" link set dev eth1 master br100 303 + 304 + # Prime the network 305 + ip netns exec "$h1" ping6 -c5 2000:103::4 >/dev/null 2>&1 306 + } 307 + 308 + run_traceroute6_vrf() 309 + { 310 + setup_traceroute6_vrf 311 + 312 + RET=0 313 + 314 + # traceroute6 host-2 from host-1 (expects 2000:102::2) 315 + run_cmd "$h1" "traceroute6 2000:103::4 | grep 2000:102::2" 316 + check_err $? "traceroute6 did not return 2000:102::2" 317 + log_test "IPv6 traceroute with VRF" 318 + 319 + cleanup_traceroute6_vrf 320 + } 321 + 322 + ################################################################################ 199 323 # traceroute test 200 324 # 201 - # Verify that traceroute from H1 to H2 shows 1.0.1.1 in this scenario 325 + # Verify that traceroute from H1 to H2 shows 1.0.3.1 and 1.0.1.1 when 326 + # traceroute uses 1.0.3.3 and 1.0.1.3 as the source IP, respectively. 202 327 # 203 - # 1.0.3.1/24 328 + # 1.0.3.3/24 1.0.3.1/24 204 329 # ---- 1.0.1.3/24 1.0.1.1/24 ---- 1.0.2.1/24 1.0.2.4/24 ---- 205 330 # |H1|--------------------------|R1|--------------------------|H2| 206 331 # ---- N1 ---- N2 ---- 207 332 # 208 - # where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and 209 - # 1.0.3.1/24 and 1.0.1.1/24 are respectively R1's primary and secondary 210 - # address on N1. 211 - # 333 + # where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and 1.0.3.1/24 and 334 + # 1.0.1.1/24 are R1's primary addresses on N1. The kernel is expected to prefer 335 + # a source address that is on the same subnet as the destination IP of the ICMP 336 + # error message. 
212 337 213 338 cleanup_traceroute() 214 339 { ··· 331 250 332 251 connect_ns $h1 eth0 1.0.1.3/24 - \ 333 252 $router eth1 1.0.3.1/24 - 253 + ip -n "$h1" addr add 1.0.3.3/24 dev eth0 334 254 ip netns exec $h1 ip route add default via 1.0.1.1 335 255 336 256 ip netns exec $router ip addr add 1.0.1.1/24 dev eth1 ··· 350 268 351 269 run_traceroute() 352 270 { 353 - if [ ! -x "$(command -v traceroute)" ]; then 354 - echo "SKIP: Could not run IPV4 test without traceroute" 355 - return 356 - fi 357 - 358 271 setup_traceroute 359 272 360 - # traceroute host-2 from host-1 (expects 1.0.1.1). Takes a while. 361 - run_cmd $h1 "traceroute 1.0.2.4 | grep -q 1.0.1.1" 362 - log_test $? 0 "IPV4 traceroute" 273 + RET=0 274 + 275 + # traceroute host-2 from host-1. Expect a source IP that is on the same 276 + # subnet as destination IP of the ICMP error message. 277 + run_cmd "$h1" "traceroute -s 1.0.1.3 1.0.2.4 | grep -q 1.0.1.1" 278 + check_err $? "traceroute did not return 1.0.1.1" 279 + run_cmd "$h1" "traceroute -s 1.0.3.3 1.0.2.4 | grep -q 1.0.3.1" 280 + check_err $? "traceroute did not return 1.0.3.1" 281 + log_test "IPv4 traceroute" 363 282 364 283 cleanup_traceroute 284 + } 285 + 286 + ################################################################################ 287 + # traceroute with VRF test 288 + # 289 + # Verify that traceroute from H1 to H2 shows 1.0.3.1 and 1.0.1.1 when 290 + # traceroute uses 1.0.3.3 and 1.0.1.3 as the source IP, respectively. The 291 + # intention is to check that the kernel does not choose an IP assigned to the 292 + # VRF device, but rather an address from the VRF port (eth1) that received the 293 + # packet that generates the ICMP error message. 
294 + # 295 + # 1.0.4.1/24 (vrf100) 296 + # 1.0.3.3/24 1.0.3.1/24 297 + # ---- 1.0.1.3/24 1.0.1.1/24 ---- 1.0.2.1/24 1.0.2.4/24 ---- 298 + # |H1|--------------------------|R1|--------------------------|H2| 299 + # ---- N1 ---- N2 ---- 300 + 301 + cleanup_traceroute_vrf() 302 + { 303 + cleanup_all_ns 304 + } 305 + 306 + setup_traceroute_vrf() 307 + { 308 + # Start clean 309 + cleanup_traceroute_vrf 310 + 311 + setup_ns h1 h2 router 312 + create_ns "$h1" 313 + create_ns "$h2" 314 + create_ns "$router" 315 + 316 + ip -n "$router" link add name vrf100 up type vrf table 100 317 + ip -n "$router" addr add 1.0.4.1/24 dev vrf100 318 + 319 + connect_ns "$h1" eth0 1.0.1.3/24 - \ 320 + "$router" eth1 1.0.1.1/24 - 321 + 322 + ip -n "$h1" addr add 1.0.3.3/24 dev eth0 323 + ip -n "$h1" route add default via 1.0.1.1 324 + 325 + ip -n "$router" link set dev eth1 master vrf100 326 + ip -n "$router" addr add 1.0.3.1/24 dev eth1 327 + ip netns exec "$router" sysctl -qw \ 328 + net.ipv4.icmp_errors_use_inbound_ifaddr=1 329 + 330 + connect_ns "$h2" eth0 1.0.2.4/24 - \ 331 + "$router" eth2 1.0.2.1/24 - 332 + 333 + ip -n "$h2" route add default via 1.0.2.1 334 + 335 + ip -n "$router" link set dev eth2 master vrf100 336 + 337 + # Prime the network 338 + ip netns exec "$h1" ping -c5 1.0.2.4 >/dev/null 2>&1 339 + } 340 + 341 + run_traceroute_vrf() 342 + { 343 + setup_traceroute_vrf 344 + 345 + RET=0 346 + 347 + # traceroute host-2 from host-1. Expect a source IP that is on the same 348 + # subnet as destination IP of the ICMP error message. 349 + run_cmd "$h1" "traceroute -s 1.0.1.3 1.0.2.4 | grep 1.0.1.1" 350 + check_err $? "traceroute did not return 1.0.1.1" 351 + run_cmd "$h1" "traceroute -s 1.0.3.3 1.0.2.4 | grep 1.0.3.1" 352 + check_err $? 
"traceroute did not return 1.0.3.1" 353 + log_test "IPv4 traceroute with VRF" 354 + 355 + cleanup_traceroute_vrf 365 356 } 366 357 367 358 ################################################################################ ··· 443 288 run_tests() 444 289 { 445 290 run_traceroute6 291 + run_traceroute6_vrf 446 292 run_traceroute 293 + run_traceroute_vrf 447 294 } 448 295 449 296 ################################################################################ 450 297 # main 451 - 452 - declare -i nfail=0 453 - declare -i nsuccess=0 454 298 455 299 while getopts :pv o 456 300 do ··· 460 306 esac 461 307 done 462 308 309 + require_command traceroute6 310 + require_command traceroute 311 + 463 312 run_tests 464 313 465 - printf "\nTests passed: %3d\n" ${nsuccess} 466 - printf "Tests failed: %3d\n" ${nfail} 314 + exit "${EXIT_STATUS}"