Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'ipv4-handle-tos-and-scope-properly-for-icmp-redirects-and-pmtu-updates'

Guillaume Nault says:

====================
ipv4: Handle TOS and scope properly for ICMP redirects and PMTU updates

ICMPv4 PMTU and redirect handlers didn't properly initialise the
struct flowi4 they used for route lookups:

* ECN bits sometimes weren't cleared from ->flowi4_tos.
* The RTO_ONLINK flag wasn't taken into account for ->flowi4_scope.

In some special cases, this resulted in ICMP redirects and PMTU updates
not being taken into account because fib_lookup() couldn't retrieve the
correct route.
====================

Link: https://lore.kernel.org/r/cover.1647519748.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+151 -8
+14 -4
net/ipv4/route.c
··· 499 499 } 500 500 EXPORT_SYMBOL(__ip_select_ident); 501 501 502 + static void ip_rt_fix_tos(struct flowi4 *fl4) 503 + { 504 + __u8 tos = RT_FL_TOS(fl4); 505 + 506 + fl4->flowi4_tos = tos & IPTOS_RT_MASK; 507 + fl4->flowi4_scope = tos & RTO_ONLINK ? 508 + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; 509 + } 510 + 502 511 static void __build_flow_key(const struct net *net, struct flowi4 *fl4, 503 512 const struct sock *sk, 504 513 const struct iphdr *iph, ··· 833 824 rt = (struct rtable *) dst; 834 825 835 826 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); 827 + ip_rt_fix_tos(&fl4); 836 828 __ip_do_redirect(rt, skb, &fl4, true); 837 829 } 838 830 ··· 1058 1048 struct flowi4 fl4; 1059 1049 1060 1050 ip_rt_build_flow_key(&fl4, sk, skb); 1051 + ip_rt_fix_tos(&fl4); 1061 1052 1062 1053 /* Don't make lookup fail for bridged encapsulations */ 1063 1054 if (skb && netif_is_any_bridge_port(skb->dev)) ··· 1133 1122 goto out; 1134 1123 1135 1124 new = true; 1125 + } else { 1126 + ip_rt_fix_tos(&fl4); 1136 1127 } 1137 1128 1138 1129 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); ··· 2616 2603 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2617 2604 const struct sk_buff *skb) 2618 2605 { 2619 - __u8 tos = RT_FL_TOS(fl4); 2620 2606 struct fib_result res = { 2621 2607 .type = RTN_UNSPEC, 2622 2608 .fi = NULL, ··· 2625 2613 struct rtable *rth; 2626 2614 2627 2615 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2628 - fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2629 - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2630 - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2616 + ip_rt_fix_tos(fl4); 2631 2617 2632 2618 rcu_read_lock(); 2633 2619 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
+137 -4
tools/testing/selftests/net/pmtu.sh
··· 26 26 # - pmtu_ipv6 27 27 # Same as pmtu_ipv4, except for locked PMTU tests, using IPv6 28 28 # 29 + # - pmtu_ipv4_dscp_icmp_exception 30 + # Set up the same network topology as pmtu_ipv4, but use non-default 31 + # routing table in A. A fib-rule is used to jump to this routing table 32 + # based on DSCP. Send ICMPv4 packets with the expected DSCP value and 33 + # verify that ECN doesn't interfere with the creation of PMTU exceptions. 34 + # 35 + # - pmtu_ipv4_dscp_udp_exception 36 + # Same as pmtu_ipv4_dscp_icmp_exception, but use UDP instead of ICMP. 37 + # 29 38 # - pmtu_ipv4_vxlan4_exception 30 39 # Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel 31 40 # over IPv4 between A and B, routed via R1. On the link between R1 and B, ··· 212 203 tests=" 213 204 pmtu_ipv4_exception ipv4: PMTU exceptions 1 214 205 pmtu_ipv6_exception ipv6: PMTU exceptions 1 206 + pmtu_ipv4_dscp_icmp_exception ICMPv4 with DSCP and ECN: PMTU exceptions 1 207 + pmtu_ipv4_dscp_udp_exception UDPv4 with DSCP and ECN: PMTU exceptions 1 215 208 pmtu_ipv4_vxlan4_exception IPv4 over vxlan4: PMTU exceptions 1 216 209 pmtu_ipv6_vxlan4_exception IPv6 over vxlan4: PMTU exceptions 1 217 210 pmtu_ipv4_vxlan6_exception IPv4 over vxlan6: PMTU exceptions 1 ··· 334 323 B 6 default 61 335 324 " 336 325 326 + policy_mark=0x04 327 + rt_table=main 328 + 337 329 veth4_a_addr="192.168.1.1" 338 330 veth4_b_addr="192.168.1.2" 339 331 veth4_c_addr="192.168.2.10" ··· 360 346 err_buf= 361 347 tcpdump_pids= 362 348 nettest_pids= 349 + socat_pids= 363 350 364 351 err() { 365 352 err_buf="${err_buf}${1} ··· 738 723 739 724 ns_name="$(nsname ${ns})" 740 725 741 - ip -n ${ns_name} route add ${addr} via ${gw} 726 + ip -n "${ns_name}" route add "${addr}" table "${rt_table}" via "${gw}" 742 727 743 728 ns=""; addr=""; gw="" 744 729 done ··· 768 753 769 754 ns_name="$(nsname ${ns})" 770 755 771 - ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid} 756 + ip -n "${ns_name}" -"${fam}" route add 
"${addr}" table "${rt_table}" nhid "${nhid}" 772 757 773 758 ns=""; fam=""; addr=""; nhid="" 774 759 done ··· 811 796 fi 812 797 813 798 return 0 799 + } 800 + 801 + setup_policy_routing() { 802 + setup_routing 803 + 804 + ip -netns "${NS_A}" -4 rule add dsfield "${policy_mark}" \ 805 + table "${rt_table}" 806 + 807 + # Set the IPv4 Don't Fragment bit with tc, since socat doesn't seem to 808 + # have an option to do it. 809 + tc -netns "${NS_A}" qdisc replace dev veth_A-R1 root prio 810 + tc -netns "${NS_A}" qdisc replace dev veth_A-R2 root prio 811 + tc -netns "${NS_A}" filter add dev veth_A-R1 \ 812 + protocol ipv4 flower ip_proto udp \ 813 + action pedit ex munge ip df set 0x40 pipe csum ip and udp 814 + tc -netns "${NS_A}" filter add dev veth_A-R2 \ 815 + protocol ipv4 flower ip_proto udp \ 816 + action pedit ex munge ip df set 0x40 pipe csum ip and udp 814 817 } 815 818 816 819 setup_bridge() { ··· 936 903 done 937 904 nettest_pids= 938 905 906 + for pid in ${socat_pids}; do 907 + kill "${pid}" 908 + done 909 + socat_pids= 910 + 939 911 for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do 940 912 ip netns del ${n} 2> /dev/null 941 913 done ··· 988 950 route_get_dst_exception() { 989 951 ns_cmd="${1}" 990 952 dst="${2}" 953 + dsfield="${3}" 991 954 992 - ${ns_cmd} ip route get "${dst}" 955 + if [ -z "${dsfield}" ]; then 956 + dsfield=0 957 + fi 958 + 959 + ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}" 993 960 } 994 961 995 962 route_get_dst_pmtu_from_exception() { 996 963 ns_cmd="${1}" 997 964 dst="${2}" 965 + dsfield="${3}" 998 966 999 - mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})" 967 + mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")" 1000 968 } 1001 969 1002 970 check_pmtu_value() { ··· 1110 1066 1111 1067 test_pmtu_ipv6_exception() { 1112 1068 test_pmtu_ipvX 6 1069 + } 1070 + 1071 + test_pmtu_ipv4_dscp_icmp_exception() { 1072 + rt_table=100 1073 + 1074 + setup namespaces policy_routing || return $ksft_skip 1075 
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ 1076 + "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ 1077 + "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ 1078 + "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2 1079 + 1080 + # Set up initial MTU values 1081 + mtu "${ns_a}" veth_A-R1 2000 1082 + mtu "${ns_r1}" veth_R1-A 2000 1083 + mtu "${ns_r1}" veth_R1-B 1400 1084 + mtu "${ns_b}" veth_B-R1 1400 1085 + 1086 + mtu "${ns_a}" veth_A-R2 2000 1087 + mtu "${ns_r2}" veth_R2-A 2000 1088 + mtu "${ns_r2}" veth_R2-B 1500 1089 + mtu "${ns_b}" veth_B-R2 1500 1090 + 1091 + len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1 1092 + 1093 + dst1="${prefix4}.${b_r1}.1" 1094 + dst2="${prefix4}.${b_r2}.1" 1095 + 1096 + # Create route exceptions 1097 + dsfield=${policy_mark} # No ECN bit set (Not-ECT) 1098 + run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst1}" 1099 + 1100 + dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0)) 1101 + run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}" 1102 + 1103 + # Check that exceptions have been created with the correct PMTU 1104 + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" 1105 + check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 1106 + 1107 + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" 1108 + check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 1109 + } 1110 + 1111 + test_pmtu_ipv4_dscp_udp_exception() { 1112 + rt_table=100 1113 + 1114 + if ! 
which socat > /dev/null 2>&1; then 1115 + echo "'socat' command not found; skipping tests" 1116 + return $ksft_skip 1117 + fi 1118 + 1119 + setup namespaces policy_routing || return $ksft_skip 1120 + trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ 1121 + "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ 1122 + "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ 1123 + "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2 1124 + 1125 + # Set up initial MTU values 1126 + mtu "${ns_a}" veth_A-R1 2000 1127 + mtu "${ns_r1}" veth_R1-A 2000 1128 + mtu "${ns_r1}" veth_R1-B 1400 1129 + mtu "${ns_b}" veth_B-R1 1400 1130 + 1131 + mtu "${ns_a}" veth_A-R2 2000 1132 + mtu "${ns_r2}" veth_R2-A 2000 1133 + mtu "${ns_r2}" veth_R2-B 1500 1134 + mtu "${ns_b}" veth_B-R2 1500 1135 + 1136 + len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1 1137 + 1138 + dst1="${prefix4}.${b_r1}.1" 1139 + dst2="${prefix4}.${b_r2}.1" 1140 + 1141 + # Create route exceptions 1142 + run_cmd_bg "${ns_b}" socat UDP-LISTEN:50000 OPEN:/dev/null,wronly=1 1143 + socat_pids="${socat_pids} $!" 1144 + 1145 + dsfield=${policy_mark} # No ECN bit set (Not-ECT) 1146 + run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \ 1147 + UDP:"${dst1}":50000,tos="${dsfield}" 1148 + 1149 + dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0)) 1150 + run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \ 1151 + UDP:"${dst2}":50000,tos="${dsfield}" 1152 + 1153 + # Check that exceptions have been created with the correct PMTU 1154 + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" 1155 + check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 1156 + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" 1157 + check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 1113 1158 } 1114 1159 1115 1160 test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {