Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: ipv4 sysctl option to ignore routes when nexthop link is down

This feature is only enabled with the new per-interface or ipv4 global
sysctls called 'ignore_routes_with_linkdown'.

net.ipv4.conf.all.ignore_routes_with_linkdown = 0
net.ipv4.conf.default.ignore_routes_with_linkdown = 0
net.ipv4.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, the kernel will report to userspace that a
route is dead and will no longer resolve to this nexthop when performing a fib
lookup. This will signal to userspace that the route will not be
selected. The signalling of a RTNH_F_DEAD is only passed to userspace
if the sysctl is enabled and link is down. This was done as without it
the netlink listeners would have no idea whether or not a nexthop would
be selected. The kernel only sets RTNH_F_DEAD internally if the
interface has IFF_UP cleared.

With the new sysctl set, the following behavior can be observed
(interface p8p1 is link-down):

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15
70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1
80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2
90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1
cache
local 80.0.0.1 dev lo src 80.0.0.1
cache <local>
80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15
cache

While the route does remain in the table (so it can be modified if
needed rather than being wiped away as it would be if IFF_UP was
cleared), the proper next-hop is chosen automatically when the link is
down. Now interface p8p1 is link-up:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15
70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1
80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1
90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1
90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2
192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2
90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1
cache
local 80.0.0.1 dev lo src 80.0.0.1
cache <local>
80.0.0.2 dev p8p1 src 80.0.0.1
cache

and the output changes to what one would expect.

If the sysctl is not set, the following output would be expected when
p8p1 is down:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15
70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1
80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2

Since the dead flag does not appear, there should be no expectation that
the kernel would skip using this route due to link being down.

v2: Split kernel changes into 2 patches; this patch actually makes a
behavioral change if the sysctl is set. Also took suggestion from Alex
to simplify code by only checking sysctl during fib lookup and
suggestion from Scott to add a per-interface sysctl.

v3: Code clean-ups to make it more readable and efficient as well as a
reverse path check fix.

v4: Drop binary sysctl

v5: Whitespace fixups from Dave

v6: Style changes from Dave and checkpatch suggestions

v7: One more checkpatch fixup

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Andy Gospodarek and committed by
David S. Miller
0eeb075f 8a3d0316

+63 -24
+3
include/linux/inetdevice.h
··· 120 120 || (!IN_DEV_FORWARD(in_dev) && \ 121 121 IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) 122 122 123 + #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ 124 + IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) 125 + 123 126 #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) 124 127 #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_ORCONF((in_dev), ARP_ACCEPT) 125 128 #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
+2 -1
include/net/fib_rules.h
··· 36 36 void *result; 37 37 struct fib_rule *rule; 38 38 int flags; 39 - #define FIB_LOOKUP_NOREF 1 39 + #define FIB_LOOKUP_NOREF 1 40 + #define FIB_LOOKUP_IGNORE_LINKSTATE 2 40 41 }; 41 42 42 43 struct fib_rules_ops {
+9 -7
include/net/ip_fib.h
··· 226 226 } 227 227 228 228 static inline int fib_lookup(struct net *net, const struct flowi4 *flp, 229 - struct fib_result *res) 229 + struct fib_result *res, unsigned int flags) 230 230 { 231 231 struct fib_table *tb; 232 232 int err = -ENETUNREACH; ··· 234 234 rcu_read_lock(); 235 235 236 236 tb = fib_get_table(net, RT_TABLE_MAIN); 237 - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) 237 + if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) 238 238 err = 0; 239 239 240 240 rcu_read_unlock(); ··· 249 249 struct fib_table *fib_new_table(struct net *net, u32 id); 250 250 struct fib_table *fib_get_table(struct net *net, u32 id); 251 251 252 - int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res); 252 + int __fib_lookup(struct net *net, struct flowi4 *flp, 253 + struct fib_result *res, unsigned int flags); 253 254 254 255 static inline int fib_lookup(struct net *net, struct flowi4 *flp, 255 - struct fib_result *res) 256 + struct fib_result *res, unsigned int flags) 256 257 { 257 258 struct fib_table *tb; 258 259 int err; 259 260 261 + flags |= FIB_LOOKUP_NOREF; 260 262 if (net->ipv4.fib_has_custom_rules) 261 - return __fib_lookup(net, flp, res); 263 + return __fib_lookup(net, flp, res, flags); 262 264 263 265 rcu_read_lock(); 264 266 ··· 268 266 269 267 for (err = 0; !err; err = -ENETUNREACH) { 270 268 tb = rcu_dereference_rtnl(net->ipv4.fib_main); 271 - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) 269 + if (tb && !fib_table_lookup(tb, flp, res, flags)) 272 270 break; 273 271 274 272 tb = rcu_dereference_rtnl(net->ipv4.fib_default); 275 - if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) 273 + if (tb && !fib_table_lookup(tb, flp, res, flags)) 276 274 break; 277 275 } 278 276
+1
include/uapi/linux/ip.h
··· 164 164 IPV4_DEVCONF_ROUTE_LOCALNET, 165 165 IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, 166 166 IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, 167 + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, 167 168 __IPV4_DEVCONF_MAX 168 169 }; 169 170
+2
net/ipv4/devinet.c
··· 2169 2169 "igmpv2_unsolicited_report_interval"), 2170 2170 DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, 2171 2171 "igmpv3_unsolicited_report_interval"), 2172 + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, 2173 + "ignore_routes_with_linkdown"), 2172 2174 2173 2175 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), 2174 2176 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+3 -3
net/ipv4/fib_frontend.c
··· 280 280 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 281 281 fl4.flowi4_scope = scope; 282 282 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; 283 - if (!fib_lookup(net, &fl4, &res)) 283 + if (!fib_lookup(net, &fl4, &res, 0)) 284 284 return FIB_RES_PREFSRC(net, res); 285 285 } else { 286 286 scope = RT_SCOPE_LINK; ··· 319 319 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; 320 320 321 321 net = dev_net(dev); 322 - if (fib_lookup(net, &fl4, &res)) 322 + if (fib_lookup(net, &fl4, &res, 0)) 323 323 goto last_resort; 324 324 if (res.type != RTN_UNICAST && 325 325 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) ··· 354 354 fl4.flowi4_oif = dev->ifindex; 355 355 356 356 ret = 0; 357 - if (fib_lookup(net, &fl4, &res) == 0) { 357 + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { 358 358 if (res.type == RTN_UNICAST) 359 359 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 360 360 }
+3 -2
net/ipv4/fib_rules.c
··· 47 47 #endif 48 48 }; 49 49 50 - int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) 50 + int __fib_lookup(struct net *net, struct flowi4 *flp, 51 + struct fib_result *res, unsigned int flags) 51 52 { 52 53 struct fib_lookup_arg arg = { 53 54 .result = res, 54 - .flags = FIB_LOOKUP_NOREF, 55 + .flags = flags, 55 56 }; 56 57 int err; 57 58
+28 -5
net/ipv4/fib_semantics.c
··· 623 623 /* It is not necessary, but requires a bit of thinking */ 624 624 if (fl4.flowi4_scope < RT_SCOPE_LINK) 625 625 fl4.flowi4_scope = RT_SCOPE_LINK; 626 - err = fib_lookup(net, &fl4, &res); 626 + err = fib_lookup(net, &fl4, &res, 627 + FIB_LOOKUP_IGNORE_LINKSTATE); 627 628 if (err) { 628 629 rcu_read_unlock(); 629 630 return err; ··· 1036 1035 nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) 1037 1036 goto nla_put_failure; 1038 1037 if (fi->fib_nhs == 1) { 1038 + struct in_device *in_dev; 1039 + 1039 1040 if (fi->fib_nh->nh_gw && 1040 1041 nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) 1041 1042 goto nla_put_failure; 1042 1043 if (fi->fib_nh->nh_oif && 1043 1044 nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) 1044 1045 goto nla_put_failure; 1046 + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { 1047 + in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); 1048 + if (in_dev && 1049 + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) 1050 + rtm->rtm_flags |= RTNH_F_DEAD; 1051 + } 1045 1052 #ifdef CONFIG_IP_ROUTE_CLASSID 1046 1053 if (fi->fib_nh[0].nh_tclassid && 1047 1054 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) ··· 1066 1057 goto nla_put_failure; 1067 1058 1068 1059 for_nexthops(fi) { 1060 + struct in_device *in_dev; 1061 + 1069 1062 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 1070 1063 if (!rtnh) 1071 1064 goto nla_put_failure; 1072 1065 1073 1066 rtnh->rtnh_flags = nh->nh_flags & 0xFF; 1067 + if (nh->nh_flags & RTNH_F_LINKDOWN) { 1068 + in_dev = __in_dev_get_rcu(nh->nh_dev); 1069 + if (in_dev && 1070 + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) 1071 + rtnh->rtnh_flags |= RTNH_F_DEAD; 1072 + } 1074 1073 rtnh->rtnh_hops = nh->nh_weight - 1; 1075 1074 rtnh->rtnh_ifindex = nh->nh_oif; 1076 1075 ··· 1327 1310 void fib_select_multipath(struct fib_result *res) 1328 1311 { 1329 1312 struct fib_info *fi = res->fi; 1313 + struct in_device *in_dev; 1330 1314 int w; 1331 1315 1332 1316 spin_lock_bh(&fib_multipath_lock); 1333 1317 if (fi->fib_power <= 0) { 
1334 1318 int power = 0; 1335 1319 change_nexthops(fi) { 1336 - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { 1337 - power += nexthop_nh->nh_weight; 1338 - nexthop_nh->nh_power = nexthop_nh->nh_weight; 1339 - } 1320 + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); 1321 + if (nexthop_nh->nh_flags & RTNH_F_DEAD) 1322 + continue; 1323 + if (in_dev && 1324 + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && 1325 + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) 1326 + continue; 1327 + power += nexthop_nh->nh_weight; 1328 + nexthop_nh->nh_power = nexthop_nh->nh_weight; 1340 1329 } endfor_nexthops(fi); 1341 1330 fi->fib_power = power; 1342 1331 if (power <= 0) {
+6
net/ipv4/fib_trie.c
··· 1412 1412 continue; 1413 1413 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { 1414 1414 const struct fib_nh *nh = &fi->fib_nh[nhsel]; 1415 + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); 1415 1416 1416 1417 if (nh->nh_flags & RTNH_F_DEAD) 1418 + continue; 1419 + if (in_dev && 1420 + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && 1421 + nh->nh_flags & RTNH_F_LINKDOWN && 1422 + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) 1417 1423 continue; 1418 1424 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) 1419 1425 continue;
+1 -1
net/ipv4/netfilter/ipt_rpfilter.c
··· 40 40 struct net *net = dev_net(dev); 41 41 int ret __maybe_unused; 42 42 43 - if (fib_lookup(net, fl4, &res)) 43 + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) 44 44 return false; 45 45 46 46 if (res.type != RTN_UNICAST) {
+5 -5
net/ipv4/route.c
··· 747 747 if (!(n->nud_state & NUD_VALID)) { 748 748 neigh_event_send(n, NULL); 749 749 } else { 750 - if (fib_lookup(net, fl4, &res) == 0) { 750 + if (fib_lookup(net, fl4, &res, 0) == 0) { 751 751 struct fib_nh *nh = &FIB_RES_NH(res); 752 752 753 753 update_or_create_fnhe(nh, fl4->daddr, new_gw, ··· 975 975 return; 976 976 977 977 rcu_read_lock(); 978 - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { 978 + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { 979 979 struct fib_nh *nh = &FIB_RES_NH(res); 980 980 981 981 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, ··· 1186 1186 fl4.flowi4_mark = skb->mark; 1187 1187 1188 1188 rcu_read_lock(); 1189 - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1189 + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) 1190 1190 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1191 1191 else 1192 1192 src = inet_select_addr(rt->dst.dev, ··· 1716 1716 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 1717 1717 fl4.daddr = daddr; 1718 1718 fl4.saddr = saddr; 1719 - err = fib_lookup(net, &fl4, &res); 1719 + err = fib_lookup(net, &fl4, &res, 0); 1720 1720 if (err != 0) { 1721 1721 if (!IN_DEV_FORWARD(in_dev)) 1722 1722 err = -EHOSTUNREACH; ··· 2123 2123 goto make_route; 2124 2124 } 2125 2125 2126 - if (fib_lookup(net, fl4, &res)) { 2126 + if (fib_lookup(net, fl4, &res, 0)) { 2127 2127 res.fi = NULL; 2128 2128 res.table = NULL; 2129 2129 if (fl4->flowi4_oif) {