Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/nldev: Add support for RDMA monitoring

Introduce a new netlink command to allow RDMA event monitoring.
The RDMA events currently supported are IB device
registration/unregistration and net device attachment/detachment.

Example output of "rdma monitor", together with the commands that
trigger the events:

$ rdma monitor
$ rmmod mlx5_ib
[UNREGISTER] dev 1 rocep8s0f1
[UNREGISTER] dev 0 rocep8s0f0

$ modprobe mlx5_ib
[REGISTER] dev 2 mlx5_0
[NETDEV_ATTACH] dev 2 mlx5_0 port 1 netdev 4 eth2
[REGISTER] dev 3 mlx5_1
[NETDEV_ATTACH] dev 3 mlx5_1 port 1 netdev 5 eth3

$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev
[UNREGISTER] dev 2 rocep8s0f0
[REGISTER] dev 4 mlx5_0
[NETDEV_ATTACH] dev 4 mlx5_0 port 30 netdev 4 eth2

$ echo 4 > /sys/class/net/eth2/device/sriov_numvfs
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 2 netdev 7 eth4
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 3 netdev 8 eth5
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 4 netdev 9 eth6
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 5 netdev 10 eth7
[REGISTER] dev 5 mlx5_0
[NETDEV_ATTACH] dev 5 mlx5_0 port 1 netdev 11 eth8
[REGISTER] dev 6 mlx5_0
[NETDEV_ATTACH] dev 6 mlx5_0 port 1 netdev 12 eth9
[REGISTER] dev 7 mlx5_0
[NETDEV_ATTACH] dev 7 mlx5_0 port 1 netdev 13 eth10
[REGISTER] dev 8 mlx5_0
[NETDEV_ATTACH] dev 8 mlx5_0 port 1 netdev 14 eth11

$ echo 0 > /sys/class/net/eth2/device/sriov_numvfs
[UNREGISTER] dev 5 rocep8s0f0v0
[UNREGISTER] dev 6 rocep8s0f0v1
[UNREGISTER] dev 7 rocep8s0f0v2
[UNREGISTER] dev 8 rocep8s0f0v3
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 2
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 3
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 4
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 5

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-7-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>

authored by

Chiara Meiohas and committed by
Leon Romanovsky
9cbed5aa 8d159eb2

+187
+35
drivers/infiniband/core/device.c
··· 1351 1351 { 1352 1352 } 1353 1353 1354 + static void ib_device_notify_register(struct ib_device *device) 1355 + { 1356 + struct net_device *netdev; 1357 + u32 port; 1358 + int ret; 1359 + 1360 + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); 1361 + if (ret) 1362 + return; 1363 + 1364 + rdma_for_each_port(device, port) { 1365 + netdev = ib_device_get_netdev(device, port); 1366 + if (!netdev) 1367 + continue; 1368 + 1369 + ret = rdma_nl_notify_event(device, port, 1370 + RDMA_NETDEV_ATTACH_EVENT); 1371 + dev_put(netdev); 1372 + if (ret) 1373 + return; 1374 + } 1375 + } 1376 + 1354 1377 /** 1355 1378 * ib_register_device - Register an IB device with IB core 1356 1379 * @device: Device to register ··· 1472 1449 dev_set_uevent_suppress(&device->dev, false); 1473 1450 /* Mark for userspace that device is ready */ 1474 1451 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1452 + 1453 + ib_device_notify_register(device); 1475 1454 ib_device_put(device); 1476 1455 1477 1456 return 0; ··· 1516 1491 goto out; 1517 1492 1518 1493 disable_device(ib_dev); 1494 + rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); 1519 1495 1520 1496 /* Expedite removing unregistered pointers from the hash table */ 1521 1497 free_netdevs(ib_dev); ··· 2185 2159 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2186 2160 u32 port) 2187 2161 { 2162 + enum rdma_nl_notify_event_type etype; 2188 2163 struct net_device *old_ndev; 2189 2164 struct ib_port_data *pdata; 2190 2165 unsigned long flags; ··· 2217 2190 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2218 2191 2219 2192 add_ndev_hash(pdata); 2193 + 2194 + /* Make sure that the device is registered before we send events */ 2195 + if (xa_load(&devices, ib_dev->index) != ib_dev) 2196 + return 0; 2197 + 2198 + etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2199 + rdma_nl_notify_event(ib_dev, port, etype); 2200 + 2220 2201 return 0; 2221 2202 } 2222 2203 EXPORT_SYMBOL(ib_device_set_netdev);
+1
drivers/infiniband/core/netlink.c
··· 311 311 struct net *net = read_pnet(&rnet->net); 312 312 struct netlink_kernel_cfg cfg = { 313 313 .input = rdma_nl_rcv, 314 + .flags = NL_CFG_F_NONROOT_RECV, 314 315 }; 315 316 struct sock *nls; 316 317
+124
drivers/infiniband/core/nldev.c
··· 170 170 [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, 171 171 [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, 172 172 [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, 173 + [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, 173 174 }; 174 175 175 176 static int put_driver_name_print_type(struct sk_buff *msg, const char *name, ··· 2722 2721 .flags = RDMA_NL_ADMIN_PERM, 2723 2722 }, 2724 2723 }; 2724 + 2725 + static int fill_mon_netdev_association(struct sk_buff *msg, 2726 + struct ib_device *device, u32 port, 2727 + const struct net *net) 2728 + { 2729 + struct net_device *netdev = ib_device_get_netdev(device, port); 2730 + int ret = 0; 2731 + 2732 + if (netdev && !net_eq(dev_net(netdev), net)) 2733 + goto out; 2734 + 2735 + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); 2736 + if (ret) 2737 + goto out; 2738 + 2739 + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, 2740 + dev_name(&device->dev)); 2741 + if (ret) 2742 + goto out; 2743 + 2744 + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); 2745 + if (ret) 2746 + goto out; 2747 + 2748 + if (netdev) { 2749 + ret = nla_put_u32(msg, 2750 + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); 2751 + if (ret) 2752 + goto out; 2753 + 2754 + ret = nla_put_string(msg, 2755 + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); 2756 + } 2757 + 2758 + out: 2759 + dev_put(netdev); 2760 + return ret; 2761 + } 2762 + 2763 + static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, 2764 + enum rdma_nl_notify_event_type type) 2765 + { 2766 + struct net_device *netdev; 2767 + 2768 + switch (type) { 2769 + case RDMA_REGISTER_EVENT: 2770 + dev_warn_ratelimited(&device->dev, 2771 + "Failed to send RDMA monitor register device event\n"); 2772 + break; 2773 + case RDMA_UNREGISTER_EVENT: 2774 + dev_warn_ratelimited(&device->dev, 2775 + "Failed to send RDMA monitor unregister device event\n"); 2776 + break; 2777 + case RDMA_NETDEV_ATTACH_EVENT: 2778 + netdev = 
ib_device_get_netdev(device, port_num); 2779 + dev_warn_ratelimited(&device->dev, 2780 + "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", 2781 + port_num, netdev->ifindex); 2782 + dev_put(netdev); 2783 + break; 2784 + case RDMA_NETDEV_DETACH_EVENT: 2785 + dev_warn_ratelimited(&device->dev, 2786 + "Failed to send RDMA monitor netdev detach event: port %d\n", 2787 + port_num); 2788 + default: 2789 + break; 2790 + } 2791 + } 2792 + 2793 + int rdma_nl_notify_event(struct ib_device *device, u32 port_num, 2794 + enum rdma_nl_notify_event_type type) 2795 + { 2796 + struct sk_buff *skb; 2797 + struct net *net; 2798 + int ret = 0; 2799 + void *nlh; 2800 + 2801 + net = read_pnet(&device->coredev.rdma_net); 2802 + if (!net) 2803 + return -EINVAL; 2804 + 2805 + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 2806 + if (!skb) 2807 + return -ENOMEM; 2808 + nlh = nlmsg_put(skb, 0, 0, 2809 + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), 2810 + 0, 0); 2811 + 2812 + switch (type) { 2813 + case RDMA_REGISTER_EVENT: 2814 + case RDMA_UNREGISTER_EVENT: 2815 + ret = fill_nldev_handle(skb, device); 2816 + if (ret) 2817 + goto err_free; 2818 + break; 2819 + case RDMA_NETDEV_ATTACH_EVENT: 2820 + case RDMA_NETDEV_DETACH_EVENT: 2821 + ret = fill_mon_netdev_association(skb, device, 2822 + port_num, net); 2823 + if (ret) 2824 + goto err_free; 2825 + break; 2826 + default: 2827 + break; 2828 + } 2829 + 2830 + ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); 2831 + if (ret) 2832 + goto err_free; 2833 + 2834 + nlmsg_end(skb, nlh); 2835 + ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); 2836 + if (ret && ret != -ESRCH) { 2837 + skb = NULL; /* skb is freed in the netlink send-op handling */ 2838 + goto err_free; 2839 + } 2840 + return 0; 2841 + 2842 + err_free: 2843 + rdma_nl_notify_err_msg(device, port_num, type); 2844 + nlmsg_free(skb); 2845 + return ret; 2846 + } 2725 2847 2726 2848 void __init nldev_init(void) 2727 2849 {
+12
include/rdma/rdma_netlink.h
··· 6 6 #include <linux/netlink.h> 7 7 #include <uapi/rdma/rdma_netlink.h> 8 8 9 + struct ib_device; 10 + 9 11 enum { 10 12 RDMA_NLDEV_ATTR_EMPTY_STRING = 1, 11 13 RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, ··· 111 109 * Returns true on success or false if no listeners. 112 110 */ 113 111 bool rdma_nl_chk_listeners(unsigned int group); 112 + 113 + /** 114 + * Prepare and send an event message 115 + * @ib: the IB device which triggered the event 116 + * @port_num: the port number which triggered the event - 0 if unused 117 + * @type: the event type 118 + * Returns 0 on success or a negative error code 119 + */ 120 + int rdma_nl_notify_event(struct ib_device *ib, u32 port_num, 121 + enum rdma_nl_notify_event_type type); 114 122 115 123 struct rdma_link_ops { 116 124 struct list_head list;
+15
include/uapi/rdma/rdma_netlink.h
··· 15 15 enum { 16 16 RDMA_NL_GROUP_IWPM = 2, 17 17 RDMA_NL_GROUP_LS, 18 + RDMA_NL_GROUP_NOTIFY, 18 19 RDMA_NL_NUM_GROUPS 19 20 }; 20 21 ··· 306 305 307 306 RDMA_NLDEV_CMD_DELDEV, 308 307 308 + RDMA_NLDEV_CMD_MONITOR, 309 + 309 310 RDMA_NLDEV_NUM_OPS 310 311 }; 311 312 ··· 577 574 578 575 RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, /* u8 */ 579 576 577 + RDMA_NLDEV_ATTR_EVENT_TYPE, /* u8 */ 578 + 580 579 /* 581 580 * Always the end 582 581 */ ··· 627 622 enum rdma_nl_name_assign_type { 628 623 RDMA_NAME_ASSIGN_TYPE_UNKNOWN = 0, 629 624 RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */ 625 + }; 626 + 627 + /* 628 + * Supported rdma monitoring event types. 629 + */ 630 + enum rdma_nl_notify_event_type { 631 + RDMA_REGISTER_EVENT, 632 + RDMA_UNREGISTER_EVENT, 633 + RDMA_NETDEV_ATTACH_EVENT, 634 + RDMA_NETDEV_DETACH_EVENT, 630 635 }; 631 636 632 637 #endif /* _UAPI_RDMA_NETLINK_H */