Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: mctp: add gateway routing support

This change allows for gateway routing, where a route table entry
may reference a routable endpoint (by network and EID), instead of
routing directly to a netdevice.

We add support for an RTA_GATEWAY attribute for netlink route updates,
with an attribute format of:

    struct mctp_fq_addr {
        unsigned int net;
        mctp_eid_t eid;
    };

We need the net here to uniquely identify the target EID, as a gateway
route no longer has a device reference (which would provide the net id
in the case of direct routes).

This makes route lookups recursive, as a route lookup that returns a
gateway route must be resolved into a direct route (i.e., to a device)
eventually. We limit the depth of these lookups (currently to 32) to
prevent routing loops from being followed indefinitely.

The route lookup populates a new 'nexthop' field in the dst structure,
which now specifies the key for the neighbour table lookup on device
output, rather than using the packet destination address directly.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Jeremy Kerr and committed by
Paolo Abeni
ad39c12f 28ddbb2a

+173 -57
+12 -1
include/net/mctp.h
··· 237 237 mctp_eid_t min, max; 238 238 239 239 unsigned char type; 240 + 240 241 unsigned int mtu; 241 - struct mctp_dev *dev; 242 + 243 + enum { 244 + MCTP_ROUTE_DIRECT, 245 + MCTP_ROUTE_GATEWAY, 246 + } dst_type; 247 + union { 248 + struct mctp_dev *dev; 249 + struct mctp_fq_addr gateway; 250 + }; 251 + 242 252 int (*output)(struct mctp_dst *dst, 243 253 struct sk_buff *skb); 244 254 ··· 266 256 struct mctp_dst { 267 257 struct mctp_dev *dev; 268 258 unsigned int mtu; 259 + mctp_eid_t nexthop; 269 260 270 261 /* set for direct addressing */ 271 262 unsigned char halen;
+8
include/uapi/linux/mctp.h
··· 37 37 __u8 smctp_haddr[MAX_ADDR_LEN]; 38 38 }; 39 39 40 + /* A "fully qualified" MCTP address, which includes the system-local network ID, 41 + * required to uniquely resolve a routable EID. 42 + */ 43 + struct mctp_fq_addr { 44 + unsigned int net; 45 + mctp_eid_t eid; 46 + }; 47 + 40 48 #define MCTP_NET_ANY 0x0 41 49 42 50 #define MCTP_ADDR_NULL 0x00
+151 -55
net/mctp/route.c
··· 563 563 564 564 static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) 565 565 { 566 - struct mctp_hdr *hdr = mctp_hdr(skb); 567 566 char daddr_buf[MAX_ADDR_LEN]; 568 567 char *daddr = NULL; 569 568 int rc; ··· 585 586 daddr = dst->haddr; 586 587 } else { 587 588 /* If lookup fails let the device handle daddr==NULL */ 588 - if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0) 589 + if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0) 589 590 daddr = daddr_buf; 590 591 } 591 592 ··· 609 610 static void mctp_route_release(struct mctp_route *rt) 610 611 { 611 612 if (refcount_dec_and_test(&rt->refs)) { 612 - mctp_dev_put(rt->dev); 613 + if (rt->dst_type == MCTP_ROUTE_DIRECT) 614 + mctp_dev_put(rt->dev); 613 615 kfree_rcu(rt, rcu); 614 616 } 615 617 } ··· 799 799 } 800 800 801 801 /* routing lookups */ 802 + static unsigned int mctp_route_netid(struct mctp_route *rt) 803 + { 804 + return rt->dst_type == MCTP_ROUTE_DIRECT ? 805 + READ_ONCE(rt->dev->net) : rt->gateway.net; 806 + } 807 + 802 808 static bool mctp_rt_match_eid(struct mctp_route *rt, 803 809 unsigned int net, mctp_eid_t eid) 804 810 { 805 - return READ_ONCE(rt->dev->net) == net && 811 + return mctp_route_netid(rt) == net && 806 812 rt->min <= eid && rt->max >= eid; 807 813 } 808 814 ··· 817 811 struct mctp_route *rt2) 818 812 { 819 813 ASSERT_RTNL(); 820 - return rt1->dev->net == rt2->dev->net && 814 + return mctp_route_netid(rt1) == mctp_route_netid(rt2) && 821 815 rt1->min == rt2->min && 822 816 rt1->max == rt2->max; 823 817 } 824 818 825 - static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route) 819 + /* must only be called on a direct route, as the final output hop */ 820 + static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid, 821 + unsigned int mtu, struct mctp_route *route) 826 822 { 827 823 mctp_dev_hold(route->dev); 824 + dst->nexthop = eid; 828 825 dst->dev = route->dev; 829 - dst->mtu = route->mtu ?: 
READ_ONCE(dst->dev->dev->mtu); 826 + dst->mtu = READ_ONCE(dst->dev->dev->mtu); 827 + if (mtu) 828 + dst->mtu = min(dst->mtu, mtu); 830 829 dst->halen = 0; 831 830 dst->output = route->output; 832 831 } ··· 865 854 dst->mtu = READ_ONCE(netdev->mtu); 866 855 dst->halen = halen; 867 856 dst->output = mctp_dst_output; 857 + dst->nexthop = 0; 868 858 memcpy(dst->haddr, haddr, halen); 869 859 870 860 rc = 0; ··· 880 868 mctp_dev_put(dst->dev); 881 869 } 882 870 871 + static struct mctp_route *mctp_route_lookup_single(struct net *net, 872 + unsigned int dnet, 873 + mctp_eid_t daddr) 874 + { 875 + struct mctp_route *rt; 876 + 877 + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { 878 + if (mctp_rt_match_eid(rt, dnet, daddr)) 879 + return rt; 880 + } 881 + 882 + return NULL; 883 + } 884 + 883 885 /* populates *dst on successful lookup, if set */ 884 886 int mctp_route_lookup(struct net *net, unsigned int dnet, 885 887 mctp_eid_t daddr, struct mctp_dst *dst) 886 888 { 889 + const unsigned int max_depth = 32; 890 + unsigned int depth, mtu = 0; 887 891 int rc = -EHOSTUNREACH; 888 - struct mctp_route *rt; 889 892 890 893 rcu_read_lock(); 891 894 892 - list_for_each_entry_rcu(rt, &net->mctp.routes, list) { 893 - /* TODO: add metrics */ 894 - if (!mctp_rt_match_eid(rt, dnet, daddr)) 895 - continue; 895 + for (depth = 0; depth < max_depth; depth++) { 896 + struct mctp_route *rt; 896 897 897 - if (dst) 898 - mctp_dst_from_route(dst, rt); 899 - rc = 0; 900 - break; 898 + rt = mctp_route_lookup_single(net, dnet, daddr); 899 + if (!rt) 900 + break; 901 + 902 + /* clamp mtu to the smallest in the path, allowing 0 903 + * to specify no restrictions 904 + */ 905 + if (mtu && rt->mtu) 906 + mtu = min(mtu, rt->mtu); 907 + else 908 + mtu = mtu ?: rt->mtu; 909 + 910 + if (rt->dst_type == MCTP_ROUTE_DIRECT) { 911 + if (dst) 912 + mctp_dst_from_route(dst, daddr, mtu, rt); 913 + rc = 0; 914 + break; 915 + 916 + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { 917 + daddr = 
rt->gateway.eid; 918 + } 901 919 } 902 920 903 921 rcu_read_unlock(); ··· 944 902 rcu_read_lock(); 945 903 946 904 list_for_each_entry_rcu(rt, &net->mctp.routes, list) { 947 - if (rt->dev->dev != dev || rt->type != RTN_LOCAL) 905 + if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL) 948 906 continue; 949 907 950 - mctp_dst_from_route(dst, rt); 908 + if (rt->dev->dev != dev) 909 + continue; 910 + 911 + mctp_dst_from_route(dst, 0, 0, rt); 951 912 rc = 0; 952 913 break; 953 914 } ··· 1130 1085 return rc; 1131 1086 } 1132 1087 1133 - static unsigned int mctp_route_netid(struct mctp_route *rt) 1134 - { 1135 - return rt->dev->net; 1136 - } 1137 - 1138 1088 /* route management */ 1139 1089 1140 1090 /* mctp_route_add(): Add the provided route, previously allocated via ··· 1137 1097 * hold on rt->dev for usage in the route table. On failure a caller will want 1138 1098 * to mctp_route_release(). 1139 1099 * 1140 - * We expect that the caller has set rt->type, rt->min, rt->max, rt->dev and 1141 - * rt->mtu, and that the route holds a reference to rt->dev (via mctp_dev_hold). 1142 - * Other fields will be populated. 1100 + * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max, 1101 + * rt->mtu and either rt->dev (with a reference held appropriately) or 1102 + * rt->gateway. Other fields will be populated. 
1143 1103 */ 1144 1104 static int mctp_route_add(struct net *net, struct mctp_route *rt) 1145 1105 { ··· 1148 1108 if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max)) 1149 1109 return -EINVAL; 1150 1110 1151 - if (!rt->dev) 1111 + if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev) 1112 + return -EINVAL; 1113 + 1114 + if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid) 1152 1115 return -EINVAL; 1153 1116 1154 1117 switch (rt->type) { ··· 1220 1177 1221 1178 rt->min = addr; 1222 1179 rt->max = addr; 1180 + rt->dst_type = MCTP_ROUTE_DIRECT; 1223 1181 rt->dev = mdev; 1224 1182 rt->type = RTN_LOCAL; 1225 1183 ··· 1247 1203 1248 1204 ASSERT_RTNL(); 1249 1205 list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { 1250 - if (rt->dev == mdev) { 1206 + if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) { 1251 1207 list_del_rcu(&rt->list); 1252 1208 /* TODO: immediate RTM_DELROUTE */ 1253 1209 mctp_route_release(rt); ··· 1340 1296 [RTA_DST] = { .type = NLA_U8 }, 1341 1297 [RTA_METRICS] = { .type = NLA_NESTED }, 1342 1298 [RTA_OIF] = { .type = NLA_U32 }, 1299 + [RTA_GATEWAY] = NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)), 1343 1300 }; 1344 1301 1345 1302 static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { 1346 1303 [RTAX_MTU] = { .type = NLA_U32 }, 1347 1304 }; 1348 1305 1349 - /* base parsing; common to both _lookup and _populate variants */ 1306 + /* base parsing; common to both _lookup and _populate variants. 1307 + * 1308 + * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate 1309 + * *gatweayp. for direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev. 
1310 + */ 1350 1311 static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, 1351 1312 struct netlink_ext_ack *extack, 1352 1313 struct nlattr **tb, struct rtmsg **rtm, 1353 1314 struct mctp_dev **mdev, 1315 + struct mctp_fq_addr *gatewayp, 1354 1316 mctp_eid_t *daddr_start) 1355 1317 { 1318 + struct mctp_fq_addr *gateway = NULL; 1319 + unsigned int ifindex = 0; 1356 1320 struct net_device *dev; 1357 - unsigned int ifindex; 1358 1321 int rc; 1359 1322 1360 1323 rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, ··· 1377 1326 } 1378 1327 *daddr_start = nla_get_u8(tb[RTA_DST]); 1379 1328 1380 - if (!tb[RTA_OIF]) { 1381 - NL_SET_ERR_MSG(extack, "ifindex missing"); 1329 + if (tb[RTA_OIF]) 1330 + ifindex = nla_get_u32(tb[RTA_OIF]); 1331 + 1332 + if (tb[RTA_GATEWAY]) 1333 + gateway = nla_data(tb[RTA_GATEWAY]); 1334 + 1335 + if (ifindex && gateway) { 1336 + NL_SET_ERR_MSG(extack, 1337 + "cannot specify both ifindex and gateway"); 1338 + return -EINVAL; 1339 + 1340 + } else if (ifindex) { 1341 + dev = __dev_get_by_index(net, ifindex); 1342 + if (!dev) { 1343 + NL_SET_ERR_MSG(extack, "bad ifindex"); 1344 + return -ENODEV; 1345 + } 1346 + *mdev = mctp_dev_get_rtnl(dev); 1347 + if (!*mdev) 1348 + return -ENODEV; 1349 + gatewayp->eid = 0; 1350 + 1351 + } else if (gateway) { 1352 + if (!mctp_address_unicast(gateway->eid)) { 1353 + NL_SET_ERR_MSG(extack, "bad gateway"); 1354 + return -EINVAL; 1355 + } 1356 + 1357 + gatewayp->eid = gateway->eid; 1358 + gatewayp->net = gateway->net != MCTP_NET_ANY ? 
1359 + gateway->net : 1360 + READ_ONCE(net->mctp.default_net); 1361 + *mdev = NULL; 1362 + 1363 + } else { 1364 + NL_SET_ERR_MSG(extack, "no route output provided"); 1382 1365 return -EINVAL; 1383 1366 } 1384 - ifindex = nla_get_u32(tb[RTA_OIF]); 1385 1367 1386 1368 *rtm = nlmsg_data(nlh); 1387 1369 if ((*rtm)->rtm_family != AF_MCTP) { ··· 1426 1342 NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); 1427 1343 return -EINVAL; 1428 1344 } 1429 - 1430 - dev = __dev_get_by_index(net, ifindex); 1431 - if (!dev) { 1432 - NL_SET_ERR_MSG(extack, "bad ifindex"); 1433 - return -ENODEV; 1434 - } 1435 - 1436 - *mdev = mctp_dev_get_rtnl(dev); 1437 - if (!*mdev) 1438 - return -ENODEV; 1439 1345 1440 1346 return 0; 1441 1347 } ··· 1440 1366 unsigned int *daddr_extent) 1441 1367 { 1442 1368 struct nlattr *tb[RTA_MAX + 1]; 1369 + struct mctp_fq_addr gw; 1443 1370 struct mctp_dev *mdev; 1444 1371 struct rtmsg *rtm; 1445 1372 int rc; 1446 1373 1447 1374 rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, 1448 - &mdev, daddr_start); 1375 + &mdev, &gw, daddr_start); 1449 1376 if (rc) 1450 1377 return rc; 1451 1378 1452 - *netid = mdev->net; 1379 + if (mdev) { 1380 + *netid = mdev->net; 1381 + } else if (gw.eid) { 1382 + *netid = gw.net; 1383 + } else { 1384 + /* bug: _nlparse_common should not allow this */ 1385 + return -1; 1386 + } 1387 + 1453 1388 *type = rtm->rtm_type; 1454 1389 *daddr_extent = rtm->rtm_dst_len; 1455 1390 1456 1391 return 0; 1457 1392 } 1458 1393 1459 - /* Full route parse for RTM_NEWROUTE: populate @rt. On success, the route will 1460 - * hold a reference to the dev. 1394 + /* Full route parse for RTM_NEWROUTE: populate @rt. On success, 1395 + * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference 1396 + * to that dev. 
1461 1397 */ 1462 1398 static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, 1463 1399 struct netlink_ext_ack *extack, ··· 1476 1392 struct nlattr *tbx[RTAX_MAX + 1]; 1477 1393 struct nlattr *tb[RTA_MAX + 1]; 1478 1394 unsigned int daddr_extent; 1395 + struct mctp_fq_addr gw; 1479 1396 mctp_eid_t daddr_start; 1480 1397 struct mctp_dev *dev; 1481 1398 struct rtmsg *rtm; ··· 1484 1399 int rc; 1485 1400 1486 1401 rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, 1487 - &dev, &daddr_start); 1402 + &dev, &gw, &daddr_start); 1488 1403 if (rc) 1489 1404 return rc; 1490 1405 ··· 1510 1425 rt->min = daddr_start; 1511 1426 rt->max = daddr_start + daddr_extent; 1512 1427 rt->mtu = mtu; 1513 - rt->dev = dev; 1514 - mctp_dev_hold(rt->dev); 1428 + if (gw.eid) { 1429 + rt->dst_type = MCTP_ROUTE_GATEWAY; 1430 + rt->gateway.eid = gw.eid; 1431 + rt->gateway.net = gw.net; 1432 + } else { 1433 + rt->dst_type = MCTP_ROUTE_DIRECT; 1434 + rt->dev = dev; 1435 + mctp_dev_hold(rt->dev); 1436 + } 1515 1437 1516 1438 return 0; 1517 1439 } ··· 1538 1446 if (rc < 0) 1539 1447 goto err_free; 1540 1448 1541 - if (rt->dev->dev->flags & IFF_LOOPBACK) { 1449 + if (rt->dst_type == MCTP_ROUTE_DIRECT && 1450 + rt->dev->dev->flags & IFF_LOOPBACK) { 1542 1451 NL_SET_ERR_MSG(extack, "no routes to loopback"); 1543 1452 rc = -EINVAL; 1544 1453 goto err_free; ··· 1598 1505 hdr->rtm_tos = 0; 1599 1506 hdr->rtm_table = RT_TABLE_DEFAULT; 1600 1507 hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ 1601 - hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? 
*/ 1602 1508 hdr->rtm_type = rt->type; 1603 1509 1604 1510 if (nla_put_u8(skb, RTA_DST, rt->min)) ··· 1614 1522 1615 1523 nla_nest_end(skb, metrics); 1616 1524 1617 - if (rt->dev) { 1525 + if (rt->dst_type == MCTP_ROUTE_DIRECT) { 1526 + hdr->rtm_scope = RT_SCOPE_LINK; 1618 1527 if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) 1619 1528 goto cancel; 1529 + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { 1530 + hdr->rtm_scope = RT_SCOPE_UNIVERSE; 1531 + if (nla_put(skb, RTA_GATEWAY, 1532 + sizeof(rt->gateway), &rt->gateway)) 1533 + goto cancel; 1620 1534 } 1621 - 1622 - /* TODO: conditional neighbour physaddr? */ 1623 1535 1624 1536 nlmsg_end(skb, nlh); 1625 1537
+2 -1
net/mctp/test/utils.c
··· 134 134 rt->rt.max = eid; 135 135 rt->rt.mtu = mtu; 136 136 rt->rt.type = RTN_UNSPEC; 137 + rt->rt.dst_type = MCTP_ROUTE_DIRECT; 137 138 if (dev) 138 139 mctp_dev_hold(dev); 139 140 rt->rt.dev = dev; ··· 177 176 list_del_rcu(&rt->rt.list); 178 177 rtnl_unlock(); 179 178 180 - if (rt->rt.dev) 179 + if (rt->rt.dst_type == MCTP_ROUTE_DIRECT && rt->rt.dev) 181 180 mctp_dev_put(rt->rt.dev); 182 181 183 182 refs = refcount_read(&rt->rt.refs);