Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'

Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev
in the host namespace. They do not have access to the physical netdev
in the host and thus can't use memory providers or AF_XDP that require
reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs that
allow containers to use memory providers and AF_XDP at native speed.
Leased queues are bound to a real queue in a physical netdev and act
as a proxy.

Memory providers and AF_XDP operations take an ifindex and queue id,
so containers would pass in an ifindex for a virtual netdev and a queue
id of a leased queue, which then gets proxied to the underlying real
queue.

We have implemented support for this concept in netkit and tested the
latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504
(bnxt_en) 100G NICs. For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260115082603.219152-1-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+1233 -117
+44
Documentation/netlink/specs/netdev.yaml
··· 339 339 doc: XSK information for this queue, if any. 340 340 type: nest 341 341 nested-attributes: xsk-info 342 + - 343 + name: lease 344 + doc: | 345 + A queue from a virtual device can have a lease which refers to 346 + another queue from a physical device. This is useful for memory 347 + providers and AF_XDP operations which take an ifindex and queue id 348 + to allow applications to bind against virtual devices in containers. 349 + type: nest 350 + nested-attributes: lease 342 351 - 343 352 name: qstats 344 353 doc: | ··· 547 538 - 548 539 name: type 549 540 - 541 + name: lease 542 + attributes: 543 + - 544 + name: ifindex 545 + doc: The netdev ifindex to lease the queue from. 546 + type: u32 547 + checks: 548 + min: 1 549 + - 550 + name: queue 551 + doc: The netdev queue to lease from. 552 + type: nest 553 + nested-attributes: queue-id 554 + - 555 + name: netns-id 556 + doc: The network namespace id of the netdev. 557 + type: s32 558 + - 550 559 name: dmabuf 551 560 attributes: 552 561 - ··· 713 686 - dmabuf 714 687 - io-uring 715 688 - xsk 689 + - lease 716 690 dump: 717 691 request: 718 692 attributes: ··· 823 795 - ifindex 824 796 - fd 825 797 reply: 798 + attributes: 799 + - id 800 + - 801 + name: queue-create 802 + doc: | 803 + Create a new queue for the given netdevice. Whether this operation 804 + is supported depends on the device and the driver. 805 + attribute-set: queue 806 + flags: [admin-perm] 807 + do: 808 + request: 809 + attributes: 810 + - ifindex 811 + - type 812 + - lease 813 + reply: &queue-create-op 826 814 attributes: 827 815 - id 828 816
+306 -54
drivers/net/netkit.c
··· 9 9 #include <linux/bpf_mprog.h> 10 10 #include <linux/indirect_call_wrapper.h> 11 11 12 + #include <net/netdev_lock.h> 13 + #include <net/netdev_queues.h> 14 + #include <net/netdev_rx_queue.h> 15 + #include <net/xdp_sock_drv.h> 12 16 #include <net/netkit.h> 13 17 #include <net/dst.h> 14 18 #include <net/tcx.h> 15 19 16 - #define DRV_NAME "netkit" 20 + #define NETKIT_DRV_NAME "netkit" 21 + 22 + #define NETKIT_NUM_RX_QUEUES_MAX 1024 23 + #define NETKIT_NUM_TX_QUEUES_MAX 1 24 + 25 + #define NETKIT_NUM_RX_QUEUES_REAL 1 26 + #define NETKIT_NUM_TX_QUEUES_REAL 1 17 27 18 28 struct netkit { 19 29 __cacheline_group_begin(netkit_fastpath); ··· 36 26 37 27 __cacheline_group_begin(netkit_slowpath); 38 28 enum netkit_mode mode; 29 + enum netkit_pairing pair; 39 30 bool primary; 40 31 u32 headroom; 41 32 __cacheline_group_end(netkit_slowpath); ··· 46 35 struct bpf_link link; 47 36 struct net_device *dev; 48 37 }; 38 + 39 + static struct rtnl_link_ops netkit_link_ops; 49 40 50 41 static __always_inline int 51 42 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, ··· 148 135 struct netkit *nk = netkit_priv(dev); 149 136 struct net_device *peer = rtnl_dereference(nk->peer); 150 137 138 + if (nk->pair == NETKIT_DEVICE_SINGLE) { 139 + netif_carrier_on(dev); 140 + return 0; 141 + } 151 142 if (!peer) 152 143 return -ENOTCONN; 153 144 if (peer->flags & IFF_UP) { ··· 236 219 stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); 237 220 } 238 221 222 + static bool netkit_xsk_supported_at_phys(const struct net_device *dev) 223 + { 224 + if (!dev->netdev_ops->ndo_bpf || 225 + !dev->netdev_ops->ndo_xdp_xmit || 226 + !dev->netdev_ops->ndo_xsk_wakeup) 227 + return false; 228 + if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) 229 + return false; 230 + return true; 231 + } 232 + 233 + static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) 234 + { 235 + struct netkit *nk = netkit_priv(dev); 236 + struct netdev_bpf xdp_lower; 237 + struct 
netdev_rx_queue *rxq; 238 + struct net_device *phys; 239 + int ret = -EBUSY; 240 + 241 + switch (xdp->command) { 242 + case XDP_SETUP_XSK_POOL: 243 + if (nk->pair == NETKIT_DEVICE_PAIR) 244 + return -EOPNOTSUPP; 245 + if (xdp->xsk.queue_id >= dev->real_num_rx_queues) 246 + return -EINVAL; 247 + 248 + rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); 249 + if (!rxq->lease) 250 + return -EOPNOTSUPP; 251 + 252 + phys = rxq->lease->dev; 253 + if (!netkit_xsk_supported_at_phys(phys)) 254 + return -EOPNOTSUPP; 255 + 256 + memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); 257 + xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); 258 + break; 259 + case XDP_SETUP_PROG: 260 + return -EPERM; 261 + default: 262 + return -EINVAL; 263 + } 264 + 265 + netdev_lock(phys); 266 + if (!dev_get_min_mp_channel_count(phys)) 267 + ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); 268 + netdev_unlock(phys); 269 + return ret; 270 + } 271 + 272 + static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) 273 + { 274 + struct netdev_rx_queue *rxq; 275 + struct net_device *phys; 276 + 277 + if (queue_id >= dev->real_num_rx_queues) 278 + return -EINVAL; 279 + 280 + rxq = __netif_get_rx_queue(dev, queue_id); 281 + if (!rxq->lease) 282 + return -EOPNOTSUPP; 283 + 284 + phys = rxq->lease->dev; 285 + if (!netkit_xsk_supported_at_phys(phys)) 286 + return -EOPNOTSUPP; 287 + 288 + return phys->netdev_ops->ndo_xsk_wakeup(phys, 289 + get_netdev_rx_queue_index(rxq->lease), flags); 290 + } 291 + 292 + static int netkit_init(struct net_device *dev) 293 + { 294 + netdev_lockdep_set_classes(dev); 295 + return 0; 296 + } 297 + 239 298 static void netkit_uninit(struct net_device *dev); 240 299 241 300 static const struct net_device_ops netkit_netdev_ops = { 301 + .ndo_init = netkit_init, 242 302 .ndo_open = netkit_open, 243 303 .ndo_stop = netkit_close, 244 304 .ndo_start_xmit = netkit_xmit, ··· 326 232 .ndo_get_peer_dev = netkit_peer_dev, 327 233 .ndo_get_stats64 = 
netkit_get_stats, 328 234 .ndo_uninit = netkit_uninit, 235 + .ndo_bpf = netkit_xsk, 236 + .ndo_xsk_wakeup = netkit_xsk_wakeup, 329 237 .ndo_features_check = passthru_features_check, 330 238 }; 331 239 332 240 static void netkit_get_drvinfo(struct net_device *dev, 333 241 struct ethtool_drvinfo *info) 334 242 { 335 - strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 243 + strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); 336 244 } 337 245 338 246 static const struct ethtool_ops netkit_ethtool_ops = { 339 247 .get_drvinfo = netkit_get_drvinfo, 340 248 }; 249 + 250 + static int netkit_queue_create(struct net_device *dev) 251 + { 252 + struct netkit *nk = netkit_priv(dev); 253 + u32 rxq_count_old, rxq_count_new; 254 + int err; 255 + 256 + rxq_count_old = dev->real_num_rx_queues; 257 + rxq_count_new = rxq_count_old + 1; 258 + 259 + /* Only allow to lease a queue in single device mode or to 260 + * lease against the peer device which then ends up in the 261 + * target netns. 262 + */ 263 + if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) 264 + return -EOPNOTSUPP; 265 + 266 + if (netif_running(dev)) 267 + netif_carrier_off(dev); 268 + err = netif_set_real_num_rx_queues(dev, rxq_count_new); 269 + if (netif_running(dev)) 270 + netif_carrier_on(dev); 271 + 272 + return err ? 
: rxq_count_old; 273 + } 274 + 275 + static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { 276 + .ndo_queue_create = netkit_queue_create, 277 + }; 278 + 279 + static struct net_device *netkit_alloc(struct nlattr *tb[], 280 + const char *ifname, 281 + unsigned char name_assign_type, 282 + unsigned int num_tx_queues, 283 + unsigned int num_rx_queues) 284 + { 285 + const struct rtnl_link_ops *ops = &netkit_link_ops; 286 + struct net_device *dev; 287 + 288 + if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || 289 + num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) 290 + return ERR_PTR(-EOPNOTSUPP); 291 + 292 + dev = alloc_netdev_mqs(ops->priv_size, ifname, 293 + name_assign_type, ops->setup, 294 + num_tx_queues, num_rx_queues); 295 + if (dev) { 296 + dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; 297 + dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; 298 + } 299 + return dev; 300 + } 301 + 302 + static void netkit_queue_unlease(struct net_device *dev) 303 + { 304 + struct netdev_rx_queue *rxq, *rxq_lease; 305 + struct net_device *dev_lease; 306 + int i; 307 + 308 + if (dev->real_num_rx_queues == 1) 309 + return; 310 + 311 + netdev_lock(dev); 312 + for (i = 1; i < dev->real_num_rx_queues; i++) { 313 + rxq = __netif_get_rx_queue(dev, i); 314 + rxq_lease = rxq->lease; 315 + dev_lease = rxq_lease->dev; 316 + 317 + netdev_lock(dev_lease); 318 + netdev_rx_queue_unlease(rxq, rxq_lease); 319 + netdev_unlock(dev_lease); 320 + } 321 + netdev_unlock(dev); 322 + } 341 323 342 324 static void netkit_setup(struct net_device *dev) 343 325 { ··· 445 275 dev->priv_flags |= IFF_DISABLE_NETPOLL; 446 276 dev->lltx = true; 447 277 448 - dev->ethtool_ops = &netkit_ethtool_ops; 449 - dev->netdev_ops = &netkit_netdev_ops; 278 + dev->netdev_ops = &netkit_netdev_ops; 279 + dev->ethtool_ops = &netkit_ethtool_ops; 280 + dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; 450 281 451 282 dev->features |= netkit_features; 452 283 dev->hw_features = netkit_features; 453 284 
dev->hw_enc_features = netkit_features; 454 285 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 455 286 dev->vlan_features = dev->features & ~netkit_features_hw_vlan; 456 - 457 287 dev->needs_free_netdev = true; 458 288 459 289 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 290 + 291 + xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); 460 292 } 461 293 462 294 static struct net *netkit_get_link_net(const struct net_device *dev) ··· 497 325 return 0; 498 326 } 499 327 500 - static struct rtnl_link_ops netkit_link_ops; 501 - 502 328 static int netkit_new_link(struct net_device *dev, 503 329 struct rtnl_newlink_params *params, 504 330 struct netlink_ext_ack *extack) ··· 505 335 enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; 506 336 enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; 507 337 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; 338 + enum netkit_pairing pair = NETKIT_DEVICE_PAIR; 508 339 enum netkit_action policy_prim = NETKIT_PASS; 509 340 enum netkit_action policy_peer = NETKIT_PASS; 510 341 struct nlattr **data = params->data; ··· 514 343 struct nlattr **tb = params->tb; 515 344 u16 headroom = 0, tailroom = 0; 516 345 struct ifinfomsg *ifmp = NULL; 517 - struct net_device *peer; 346 + struct net_device *peer = NULL; 347 + bool seen_peer = false; 518 348 char ifname[IFNAMSIZ]; 519 349 struct netkit *nk; 520 350 int err; ··· 552 380 headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); 553 381 if (data[IFLA_NETKIT_TAILROOM]) 554 382 tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); 383 + if (data[IFLA_NETKIT_PAIRING]) 384 + pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); 385 + 386 + seen_peer = data[IFLA_NETKIT_PEER_INFO] || 387 + data[IFLA_NETKIT_PEER_SCRUB] || 388 + data[IFLA_NETKIT_PEER_POLICY]; 555 389 } 556 390 557 391 if (ifmp && tbp[IFLA_IFNAME]) { ··· 570 392 if (mode != NETKIT_L2 && 571 393 (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) 572 394 return -EOPNOTSUPP; 395 + if (pair == NETKIT_DEVICE_SINGLE && 396 + (tb != tbp || seen_peer || 
policy_prim != NETKIT_PASS)) 397 + return -EOPNOTSUPP; 573 398 574 - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, 575 - &netkit_link_ops, tbp, extack); 576 - if (IS_ERR(peer)) 577 - return PTR_ERR(peer); 399 + if (pair == NETKIT_DEVICE_PAIR) { 400 + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, 401 + &netkit_link_ops, tbp, extack); 402 + if (IS_ERR(peer)) 403 + return PTR_ERR(peer); 578 404 579 - netif_inherit_tso_max(peer, dev); 580 - if (headroom) { 581 - peer->needed_headroom = headroom; 582 - dev->needed_headroom = headroom; 405 + netif_inherit_tso_max(peer, dev); 406 + if (headroom) 407 + peer->needed_headroom = headroom; 408 + if (tailroom) 409 + peer->needed_tailroom = tailroom; 410 + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) 411 + eth_hw_addr_random(peer); 412 + if (ifmp && dev->ifindex) 413 + peer->ifindex = ifmp->ifi_index; 414 + 415 + nk = netkit_priv(peer); 416 + nk->primary = false; 417 + nk->policy = policy_peer; 418 + nk->scrub = scrub_peer; 419 + nk->mode = mode; 420 + nk->pair = pair; 421 + nk->headroom = headroom; 422 + bpf_mprog_bundle_init(&nk->bundle); 423 + 424 + err = register_netdevice(peer); 425 + if (err < 0) 426 + goto err_register_peer; 427 + netif_carrier_off(peer); 428 + if (mode == NETKIT_L2) 429 + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); 430 + 431 + err = rtnl_configure_link(peer, NULL, 0, NULL); 432 + if (err < 0) 433 + goto err_configure_peer; 583 434 } 584 - if (tailroom) { 585 - peer->needed_tailroom = tailroom; 586 - dev->needed_tailroom = tailroom; 587 - } 588 - 589 - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) 590 - eth_hw_addr_random(peer); 591 - if (ifmp && dev->ifindex) 592 - peer->ifindex = ifmp->ifi_index; 593 - 594 - nk = netkit_priv(peer); 595 - nk->primary = false; 596 - nk->policy = policy_peer; 597 - nk->scrub = scrub_peer; 598 - nk->mode = mode; 599 - nk->headroom = headroom; 600 - bpf_mprog_bundle_init(&nk->bundle); 601 - 602 - err = 
register_netdevice(peer); 603 - if (err < 0) 604 - goto err_register_peer; 605 - netif_carrier_off(peer); 606 - if (mode == NETKIT_L2) 607 - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); 608 - 609 - err = rtnl_configure_link(peer, NULL, 0, NULL); 610 - if (err < 0) 611 - goto err_configure_peer; 612 435 613 436 if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) 614 437 eth_hw_addr_random(dev); ··· 617 438 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 618 439 else 619 440 strscpy(dev->name, "nk%d", IFNAMSIZ); 441 + if (headroom) 442 + dev->needed_headroom = headroom; 443 + if (tailroom) 444 + dev->needed_tailroom = tailroom; 620 445 621 446 nk = netkit_priv(dev); 622 447 nk->primary = true; 623 448 nk->policy = policy_prim; 624 449 nk->scrub = scrub_prim; 625 450 nk->mode = mode; 451 + nk->pair = pair; 626 452 nk->headroom = headroom; 627 453 bpf_mprog_bundle_init(&nk->bundle); 628 454 ··· 639 455 dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); 640 456 641 457 rcu_assign_pointer(netkit_priv(dev)->peer, peer); 642 - rcu_assign_pointer(netkit_priv(peer)->peer, dev); 458 + if (peer) 459 + rcu_assign_pointer(netkit_priv(peer)->peer, dev); 643 460 return 0; 644 461 err_configure_peer: 645 - unregister_netdevice(peer); 462 + if (peer) 463 + unregister_netdevice(peer); 646 464 return err; 647 465 err_register_peer: 648 466 free_netdev(peer); ··· 704 518 nk = netkit_priv(dev); 705 519 if (!nk->primary) 706 520 return ERR_PTR(-EACCES); 521 + if (nk->pair == NETKIT_DEVICE_SINGLE) 522 + return ERR_PTR(-EOPNOTSUPP); 707 523 if (which == BPF_NETKIT_PEER) { 708 524 dev = rcu_dereference_rtnl(nk->peer); 709 525 if (!dev) ··· 1032 844 static void netkit_uninit(struct net_device *dev) 1033 845 { 1034 846 netkit_release_all(dev); 847 + netkit_queue_unlease(dev); 1035 848 } 1036 849 1037 850 static void netkit_del_link(struct net_device *dev, struct list_head *head) ··· 1068 879 { IFLA_NETKIT_PEER_INFO, "peer info" }, 1069 880 { IFLA_NETKIT_HEADROOM, "headroom" }, 1070 
881 { IFLA_NETKIT_TAILROOM, "tailroom" }, 882 + { IFLA_NETKIT_PAIRING, "pairing" }, 1071 883 }; 1072 884 1073 885 if (!nk->primary) { ··· 1088 898 } 1089 899 1090 900 if (data[IFLA_NETKIT_POLICY]) { 901 + err = -EOPNOTSUPP; 1091 902 attr = data[IFLA_NETKIT_POLICY]; 1092 903 policy = nla_get_u32(attr); 1093 - err = netkit_check_policy(policy, attr, extack); 904 + if (nk->pair == NETKIT_DEVICE_PAIR) 905 + err = netkit_check_policy(policy, attr, extack); 1094 906 if (err) 1095 907 return err; 1096 908 WRITE_ONCE(nk->policy, policy); ··· 1113 921 return 0; 1114 922 } 1115 923 924 + static void netkit_check_lease_unregister(struct net_device *dev) 925 + { 926 + LIST_HEAD(list_kill); 927 + u32 q_idx; 928 + 929 + if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || 930 + !dev->dev.parent) 931 + return; 932 + 933 + netdev_lock_ops(dev); 934 + for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { 935 + struct net_device *tmp = dev; 936 + u32 tmp_q_idx = q_idx; 937 + 938 + if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) { 939 + if (tmp->netdev_ops != &netkit_netdev_ops) 940 + continue; 941 + /* A single phys device can have multiple queues leased 942 + * to one netkit device. We can only queue that netkit 943 + * device once to the list_kill. Queues of that phys 944 + * device can be leased with different individual netkit 945 + * devices, hence we batch via list_kill. 
946 + */ 947 + if (unregister_netdevice_queued(tmp)) 948 + continue; 949 + netkit_del_link(tmp, &list_kill); 950 + } 951 + } 952 + netdev_unlock_ops(dev); 953 + unregister_netdevice_many(&list_kill); 954 + } 955 + 956 + static int netkit_notifier(struct notifier_block *this, 957 + unsigned long event, void *ptr) 958 + { 959 + struct net_device *dev = netdev_notifier_info_to_dev(ptr); 960 + 961 + if (event == NETDEV_UNREGISTER) 962 + netkit_check_lease_unregister(dev); 963 + return NOTIFY_DONE; 964 + } 965 + 1116 966 static size_t netkit_get_size(const struct net_device *dev) 1117 967 { 1118 968 return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ ··· 1165 931 nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 1166 932 nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ 1167 933 nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ 934 + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 1168 935 0; 1169 936 } 1170 937 ··· 1185 950 if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom)) 1186 951 return -EMSGSIZE; 1187 952 if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) 953 + return -EMSGSIZE; 954 + if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) 1188 955 return -EMSGSIZE; 1189 956 1190 957 if (peer) { ··· 1209 972 [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, 1210 973 [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 1211 974 [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 975 + [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), 1212 976 [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, 1213 977 .reject_message = "Primary attribute is read-only" }, 1214 978 }; 1215 979 1216 980 static struct rtnl_link_ops netkit_link_ops = { 1217 - .kind = DRV_NAME, 981 + .kind = NETKIT_DRV_NAME, 1218 982 .priv_size = sizeof(struct netkit), 983 + .alloc = netkit_alloc, 1219 984 .setup = netkit_setup, 1220 985 .newlink = netkit_new_link, 1221 986 .dellink = 
netkit_del_link, ··· 1231 992 .maxtype = IFLA_NETKIT_MAX, 1232 993 }; 1233 994 1234 - static __init int netkit_init(void) 995 + static struct notifier_block netkit_netdev_notifier = { 996 + .notifier_call = netkit_notifier, 997 + }; 998 + 999 + static __init int netkit_mod_init(void) 1235 1000 { 1001 + int ret; 1002 + 1236 1003 BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || 1237 1004 (int)NETKIT_PASS != (int)TCX_PASS || 1238 1005 (int)NETKIT_DROP != (int)TCX_DROP || 1239 1006 (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); 1240 1007 1241 - return rtnl_link_register(&netkit_link_ops); 1008 + ret = rtnl_link_register(&netkit_link_ops); 1009 + if (ret) 1010 + return ret; 1011 + ret = register_netdevice_notifier(&netkit_netdev_notifier); 1012 + if (ret) 1013 + rtnl_link_unregister(&netkit_link_ops); 1014 + return ret; 1242 1015 } 1243 1016 1244 - static __exit void netkit_exit(void) 1017 + static __exit void netkit_mod_exit(void) 1245 1018 { 1019 + unregister_netdevice_notifier(&netkit_netdev_notifier); 1246 1020 rtnl_link_unregister(&netkit_link_ops); 1247 1021 } 1248 1022 1249 - module_init(netkit_init); 1250 - module_exit(netkit_exit); 1023 + module_init(netkit_mod_init); 1024 + module_exit(netkit_mod_exit); 1251 1025 1252 1026 MODULE_DESCRIPTION("BPF-programmable network device"); 1253 1027 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>"); 1254 1028 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>"); 1255 1029 MODULE_LICENSE("GPL"); 1256 - MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1030 + MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
+6
include/linux/netdevice.h
··· 3400 3400 int register_netdevice(struct net_device *dev); 3401 3401 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); 3402 3402 void unregister_netdevice_many(struct list_head *head); 3403 + 3403 3404 static inline void unregister_netdevice(struct net_device *dev) 3404 3405 { 3405 3406 unregister_netdevice_queue(dev, NULL); 3407 + } 3408 + 3409 + static inline bool unregister_netdevice_queued(const struct net_device *dev) 3410 + { 3411 + return !list_empty(&dev->unreg_list); 3406 3412 } 3407 3413 3408 3414 int netdev_refcnt_read(const struct net_device *dev);
+16 -3
include/net/netdev_queues.h
··· 130 130 * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used 131 131 * for this queue. Return NULL on error. 132 132 * 133 + * @ndo_queue_create: Create a new RX queue which can be leased to another queue. 134 + * Ops on this queue are redirected to the leased queue e.g. 135 + * when opening a memory provider. Return the new queue id on 136 + * success. Return negative error code on failure. 137 + * 133 138 * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while 134 139 * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only 135 140 * be called for an interface which is open. ··· 154 149 int idx); 155 150 struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, 156 151 int idx); 152 + int (*ndo_queue_create)(struct net_device *dev); 157 153 }; 158 154 159 - bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); 155 + bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); 156 + bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); 157 + bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); 160 158 161 159 /** 162 160 * DOC: Lockless queue stopping / waking helpers. ··· 348 340 }) 349 341 350 342 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); 351 - 352 - #endif 343 + bool netdev_can_create_queue(const struct net_device *dev, 344 + struct netlink_ext_ack *extack); 345 + bool netdev_can_lease_queue(const struct net_device *dev, 346 + struct netlink_ext_ack *extack); 347 + bool netdev_queue_busy(struct net_device *dev, int idx, 348 + struct netlink_ext_ack *extack); 349 + #endif /* _LINUX_NET_QUEUES_H */
+20 -1
include/net/netdev_rx_queue.h
··· 28 28 #endif 29 29 struct napi_struct *napi; 30 30 struct pp_memory_provider_params mp_params; 31 + struct netdev_rx_queue *lease; 32 + netdevice_tracker lease_tracker; 31 33 } ____cacheline_aligned_in_smp; 32 34 33 35 /* ··· 59 57 } 60 58 61 59 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); 60 + void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, 61 + struct netdev_rx_queue *rxq_src); 62 + void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, 63 + struct netdev_rx_queue *rxq_src); 64 + bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); 62 65 63 - #endif 66 + enum netif_lease_dir { 67 + NETIF_VIRT_TO_PHYS, 68 + NETIF_PHYS_TO_VIRT, 69 + }; 70 + 71 + struct netdev_rx_queue * 72 + __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, 73 + enum netif_lease_dir dir); 74 + struct netdev_rx_queue * 75 + netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); 76 + void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, 77 + struct net_device *dev); 78 + #endif /* _LINUX_NETDEV_RX_QUEUE_H */
+2 -2
include/net/page_pool/memory_provider.h
··· 23 23 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); 24 24 void net_mp_niov_clear_page_pool(struct net_iov *niov); 25 25 26 - int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, 26 + int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 27 27 struct pp_memory_provider_params *p); 28 28 int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 29 29 const struct pp_memory_provider_params *p, 30 30 struct netlink_ext_ack *extack); 31 - void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, 31 + void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 32 32 struct pp_memory_provider_params *old_p); 33 33 void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 34 34 const struct pp_memory_provider_params *old_p);
+1 -1
include/net/xdp_sock_drv.h
··· 28 28 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); 29 29 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); 30 30 void xsk_tx_release(struct xsk_buff_pool *pool); 31 - struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 31 + struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, 32 32 u16 queue_id); 33 33 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); 34 34 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool);
+6
include/uapi/linux/if_link.h
··· 1296 1296 NETKIT_L3, 1297 1297 }; 1298 1298 1299 + enum netkit_pairing { 1300 + NETKIT_DEVICE_PAIR, 1301 + NETKIT_DEVICE_SINGLE, 1302 + }; 1303 + 1299 1304 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to 1300 1305 * the BPF program if attached. This also means the latter can 1301 1306 * consume the two fields if they were populated earlier. ··· 1325 1320 IFLA_NETKIT_PEER_SCRUB, 1326 1321 IFLA_NETKIT_HEADROOM, 1327 1322 IFLA_NETKIT_TAILROOM, 1323 + IFLA_NETKIT_PAIRING, 1328 1324 __IFLA_NETKIT_MAX, 1329 1325 }; 1330 1326 #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
+11
include/uapi/linux/netdev.h
··· 160 160 NETDEV_A_QUEUE_DMABUF, 161 161 NETDEV_A_QUEUE_IO_URING, 162 162 NETDEV_A_QUEUE_XSK, 163 + NETDEV_A_QUEUE_LEASE, 163 164 164 165 __NETDEV_A_QUEUE_MAX, 165 166 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 204 203 }; 205 204 206 205 enum { 206 + NETDEV_A_LEASE_IFINDEX = 1, 207 + NETDEV_A_LEASE_QUEUE, 208 + NETDEV_A_LEASE_NETNS_ID, 209 + 210 + __NETDEV_A_LEASE_MAX, 211 + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) 212 + }; 213 + 214 + enum { 207 215 NETDEV_A_DMABUF_IFINDEX = 1, 208 216 NETDEV_A_DMABUF_QUEUES, 209 217 NETDEV_A_DMABUF_FD, ··· 238 228 NETDEV_CMD_BIND_RX, 239 229 NETDEV_CMD_NAPI_SET, 240 230 NETDEV_CMD_BIND_TX, 231 + NETDEV_CMD_QUEUE_CREATE, 241 232 242 233 __NETDEV_CMD_MAX, 243 234 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+7
net/core/dev.c
··· 1115 1115 } 1116 1116 1117 1117 struct net_device * 1118 + netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker) 1119 + { 1120 + netdev_tracker_free(dev, tracker); 1121 + return __netdev_put_lock(dev, dev_net(dev)); 1122 + } 1123 + 1124 + struct net_device * 1118 1125 netdev_xa_find_lock(struct net *net, struct net_device *dev, 1119 1126 unsigned long *index) 1120 1127 {
+2
net/core/dev.h
··· 30 30 struct net_device *dev_get_by_napi_id(unsigned int napi_id); 31 31 32 32 struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); 33 + struct net_device *netdev_put_lock(struct net_device *dev, 34 + netdevice_tracker *tracker); 33 35 struct net_device * 34 36 netdev_xa_find_lock(struct net *net, struct net_device *dev, 35 37 unsigned long *index);
+20
net/core/netdev-genl-gen.c
··· 28 28 }; 29 29 30 30 /* Common nested types */ 31 + const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { 32 + [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 33 + [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), 34 + [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, 35 + }; 36 + 31 37 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { 32 38 [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), 33 39 [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), ··· 111 105 static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { 112 106 [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 113 107 [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, 108 + }; 109 + 110 + /* NETDEV_CMD_QUEUE_CREATE - do */ 111 + static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { 112 + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 113 + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), 114 + [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), 114 115 }; 115 116 116 117 /* Ops table for netdev */ ··· 217 204 .policy = netdev_bind_tx_nl_policy, 218 205 .maxattr = NETDEV_A_DMABUF_FD, 219 206 .flags = GENL_CMD_CAP_DO, 207 + }, 208 + { 209 + .cmd = NETDEV_CMD_QUEUE_CREATE, 210 + .doit = netdev_nl_queue_create_doit, 211 + .policy = netdev_queue_create_nl_policy, 212 + .maxattr = NETDEV_A_QUEUE_LEASE, 213 + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, 220 214 }, 221 215 }; 222 216
+2
net/core/netdev-genl-gen.h
··· 14 14 #include <net/netdev_netlink.h> 15 15 16 16 /* Common nested types */ 17 + extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; 17 18 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; 18 19 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; 19 20 ··· 37 36 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); 38 37 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); 39 38 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); 39 + int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); 40 40 41 41 enum { 42 42 NETDEV_NLGRP_MGMT,
+185
net/core/netdev-genl.c
··· 391 391 u32 q_idx, u32 q_type, const struct genl_info *info) 392 392 { 393 393 struct pp_memory_provider_params *params; 394 + struct net_device *orig_netdev = netdev; 395 + struct nlattr *nest_lease, *nest_queue; 394 396 struct netdev_rx_queue *rxq; 395 397 struct netdev_queue *txq; 398 + u32 lease_q_idx = q_idx; 396 399 void *hdr; 397 400 398 401 hdr = genlmsg_iput(rsp, info); ··· 412 409 rxq = __netif_get_rx_queue(netdev, q_idx); 413 410 if (nla_put_napi_id(rsp, rxq->napi)) 414 411 goto nla_put_failure; 412 + 413 + if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { 414 + struct net *net, *peer_net; 415 + 416 + nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); 417 + if (!nest_lease) 418 + goto nla_put_failure; 419 + nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); 420 + if (!nest_queue) 421 + goto nla_put_failure; 422 + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) 423 + goto nla_put_failure; 424 + if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) 425 + goto nla_put_failure; 426 + nla_nest_end(rsp, nest_queue); 427 + if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, 428 + READ_ONCE(netdev->ifindex))) 429 + goto nla_put_failure; 430 + rcu_read_lock(); 431 + peer_net = dev_net_rcu(netdev); 432 + net = dev_net_rcu(orig_netdev); 433 + if (!net_eq(net, peer_net)) { 434 + s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); 435 + 436 + if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) 437 + goto nla_put_failure_unlock; 438 + } 439 + rcu_read_unlock(); 440 + nla_nest_end(rsp, nest_lease); 441 + netdev = orig_netdev; 442 + } 415 443 416 444 params = &rxq->mp_params; 417 445 if (params->mp_ops && ··· 471 437 472 438 return 0; 473 439 440 + nla_put_failure_unlock: 441 + rcu_read_unlock(); 474 442 nla_put_failure: 475 443 genlmsg_cancel(rsp, hdr); 476 444 return -EMSGSIZE; ··· 1151 1115 netdev_unlock(netdev); 1152 1116 err_unlock_sock: 1153 1117 mutex_unlock(&priv->lock); 1118 + err_genlmsg_free: 1119 + nlmsg_free(rsp); 1120 + return err; 
1121 + } 1122 + 1123 + int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) 1124 + { 1125 + const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; 1126 + const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; 1127 + int err, ifindex, ifindex_lease, queue_id, queue_id_lease; 1128 + struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; 1129 + struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; 1130 + struct netdev_rx_queue *rxq, *rxq_lease; 1131 + struct net_device *dev, *dev_lease; 1132 + netdevice_tracker dev_tracker; 1133 + struct nlattr *nest; 1134 + struct sk_buff *rsp; 1135 + void *hdr; 1136 + 1137 + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || 1138 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || 1139 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) 1140 + return -EINVAL; 1141 + if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != 1142 + NETDEV_QUEUE_TYPE_RX) { 1143 + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); 1144 + return -EINVAL; 1145 + } 1146 + 1147 + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); 1148 + 1149 + nest = info->attrs[NETDEV_A_QUEUE_LEASE]; 1150 + err = nla_parse_nested(ltb, lmaxtype, nest, 1151 + netdev_lease_nl_policy, info->extack); 1152 + if (err < 0) 1153 + return err; 1154 + if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || 1155 + NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) 1156 + return -EINVAL; 1157 + if (ltb[NETDEV_A_LEASE_NETNS_ID]) { 1158 + NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]); 1159 + return -EINVAL; 1160 + } 1161 + 1162 + ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); 1163 + 1164 + nest = ltb[NETDEV_A_LEASE_QUEUE]; 1165 + err = nla_parse_nested(qtb, qmaxtype, nest, 1166 + netdev_queue_id_nl_policy, info->extack); 1167 + if (err < 0) 1168 + return err; 1169 + if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || 1170 + 
NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) 1171 + return -EINVAL; 1172 + if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { 1173 + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); 1174 + return -EINVAL; 1175 + } 1176 + if (ifindex == ifindex_lease) { 1177 + NL_SET_ERR_MSG(info->extack, 1178 + "Lease ifindex cannot be the same as queue creation ifindex"); 1179 + return -EINVAL; 1180 + } 1181 + 1182 + queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); 1183 + 1184 + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); 1185 + if (!rsp) 1186 + return -ENOMEM; 1187 + 1188 + hdr = genlmsg_iput(rsp, info); 1189 + if (!hdr) { 1190 + err = -EMSGSIZE; 1191 + goto err_genlmsg_free; 1192 + } 1193 + 1194 + /* Locking order is always from the virtual to the physical device 1195 + * since this is also the same order when applications open the 1196 + * memory provider later on. 1197 + */ 1198 + dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); 1199 + if (!dev) { 1200 + err = -ENODEV; 1201 + goto err_genlmsg_free; 1202 + } 1203 + if (!netdev_can_create_queue(dev, info->extack)) { 1204 + err = -EINVAL; 1205 + goto err_unlock_dev; 1206 + } 1207 + 1208 + dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease, 1209 + &dev_tracker, GFP_KERNEL); 1210 + if (!dev_lease) { 1211 + err = -ENODEV; 1212 + goto err_unlock_dev; 1213 + } 1214 + if (!netdev_can_lease_queue(dev_lease, info->extack)) { 1215 + netdev_put(dev_lease, &dev_tracker); 1216 + err = -EINVAL; 1217 + goto err_unlock_dev; 1218 + } 1219 + 1220 + dev_lease = netdev_put_lock(dev_lease, &dev_tracker); 1221 + if (!dev_lease) { 1222 + err = -ENODEV; 1223 + goto err_unlock_dev; 1224 + } 1225 + if (queue_id_lease >= dev_lease->real_num_rx_queues) { 1226 + err = -ERANGE; 1227 + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); 1228 + goto err_unlock_dev_lease; 1229 + } 1230 + if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) { 1231 + err = 
-EBUSY; 1232 + goto err_unlock_dev_lease; 1233 + } 1234 + 1235 + rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); 1236 + rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); 1237 + 1238 + if (rxq->lease && rxq->lease->dev != dev_lease) { 1239 + err = -EOPNOTSUPP; 1240 + NL_SET_ERR_MSG(info->extack, 1241 + "Leasing multiple queues from different devices not supported"); 1242 + goto err_unlock_dev_lease; 1243 + } 1244 + 1245 + err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev); 1246 + if (err < 0) { 1247 + NL_SET_ERR_MSG(info->extack, 1248 + "Device is unable to create a new queue"); 1249 + goto err_unlock_dev_lease; 1250 + } 1251 + 1252 + rxq = __netif_get_rx_queue(dev, queue_id); 1253 + netdev_rx_queue_lease(rxq, rxq_lease); 1254 + 1255 + nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); 1256 + genlmsg_end(rsp, hdr); 1257 + 1258 + netdev_unlock(dev_lease); 1259 + netdev_unlock(dev); 1260 + 1261 + return genlmsg_reply(rsp, info); 1262 + 1263 + err_unlock_dev_lease: 1264 + netdev_unlock(dev_lease); 1265 + err_unlock_dev: 1266 + netdev_unlock(dev); 1154 1267 err_genlmsg_free: 1155 1268 nlmsg_free(rsp); 1156 1269 return err;
+72 -2
net/core/netdev_queues.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 3 3 #include <net/netdev_queues.h> 4 + #include <net/netdev_rx_queue.h> 5 + #include <net/xdp_sock_drv.h> 4 6 5 7 /** 6 8 * netdev_queue_get_dma_dev() - get dma device for zero-copy operations 7 9 * @dev: net_device 8 10 * @idx: queue index 9 11 * 10 - * Get dma device for zero-copy operations to be used for this queue. 12 + * Get dma device for zero-copy operations to be used for this queue. If the 13 + * queue is leased to a physical queue, we retrieve the latter's dma device. 11 14 * When such device is not available or valid, the function will return NULL. 12 15 * 13 16 * Return: Device or NULL on error 14 17 */ 15 18 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) 16 19 { 17 - const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; 20 + const struct netdev_queue_mgmt_ops *queue_ops; 18 21 struct device *dma_dev; 22 + 23 + if (idx < dev->real_num_rx_queues) { 24 + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); 25 + 26 + if (rxq->lease) { 27 + rxq = rxq->lease; 28 + dev = rxq->dev; 29 + idx = get_netdev_rx_queue_index(rxq); 30 + } 31 + } 32 + 33 + queue_ops = dev->queue_mgmt_ops; 19 34 20 35 if (queue_ops && queue_ops->ndo_queue_get_dma_dev) 21 36 dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); ··· 40 25 return dma_dev && dma_dev->dma_mask ? 
dma_dev : NULL; 41 26 } 42 27 28 + bool netdev_can_create_queue(const struct net_device *dev, 29 + struct netlink_ext_ack *extack) 30 + { 31 + if (dev->dev.parent) { 32 + NL_SET_ERR_MSG(extack, "Device is not a virtual device"); 33 + return false; 34 + } 35 + if (!dev->queue_mgmt_ops || 36 + !dev->queue_mgmt_ops->ndo_queue_create) { 37 + NL_SET_ERR_MSG(extack, "Device does not support queue creation"); 38 + return false; 39 + } 40 + if (dev->real_num_rx_queues < 1 || 41 + dev->real_num_tx_queues < 1) { 42 + NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); 43 + return false; 44 + } 45 + return true; 46 + } 47 + 48 + bool netdev_can_lease_queue(const struct net_device *dev, 49 + struct netlink_ext_ack *extack) 50 + { 51 + if (!dev->dev.parent) { 52 + NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); 53 + return false; 54 + } 55 + if (!netif_device_present(dev)) { 56 + NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); 57 + return false; 58 + } 59 + if (!dev->queue_mgmt_ops) { 60 + NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); 61 + return false; 62 + } 63 + return true; 64 + } 65 + 66 + bool netdev_queue_busy(struct net_device *dev, int idx, 67 + struct netlink_ext_ack *extack) 68 + { 69 + if (netif_rxq_is_leased(dev, idx)) { 70 + NL_SET_ERR_MSG(extack, "Lease device queue is already leased"); 71 + return true; 72 + } 73 + if (xsk_get_pool_from_qid(dev, idx)) { 74 + NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP"); 75 + return true; 76 + } 77 + if (netif_rxq_has_mp(dev, idx)) { 78 + NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider"); 79 + return true; 80 + } 81 + return false; 82 + }
+147 -22
net/core/netdev_rx_queue.c
··· 9 9 10 10 #include "page_pool_priv.h" 11 11 12 - /* See also page_pool_is_unreadable() */ 13 - bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) 12 + void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, 13 + struct netdev_rx_queue *rxq_src) 14 14 { 15 - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); 15 + netdev_assert_locked(rxq_src->dev); 16 + netdev_assert_locked(rxq_dst->dev); 16 17 17 - return !!rxq->mp_params.mp_ops; 18 + netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); 19 + 20 + WRITE_ONCE(rxq_src->lease, rxq_dst); 21 + WRITE_ONCE(rxq_dst->lease, rxq_src); 22 + } 23 + 24 + void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, 25 + struct netdev_rx_queue *rxq_src) 26 + { 27 + netdev_assert_locked(rxq_dst->dev); 28 + netdev_assert_locked(rxq_src->dev); 29 + 30 + WRITE_ONCE(rxq_src->lease, NULL); 31 + WRITE_ONCE(rxq_dst->lease, NULL); 32 + 33 + netdev_put(rxq_src->dev, &rxq_src->lease_tracker); 34 + } 35 + 36 + bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) 37 + { 38 + if (rxq_idx < dev->real_num_rx_queues) 39 + return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); 40 + return false; 41 + } 42 + 43 + static bool netif_lease_dir_ok(const struct net_device *dev, 44 + enum netif_lease_dir dir) 45 + { 46 + if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) 47 + return true; 48 + if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) 49 + return true; 50 + return false; 51 + } 52 + 53 + struct netdev_rx_queue * 54 + __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, 55 + enum netif_lease_dir dir) 56 + { 57 + struct net_device *orig_dev = *dev; 58 + struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); 59 + 60 + if (rxq->lease) { 61 + if (!netif_lease_dir_ok(orig_dev, dir)) 62 + return NULL; 63 + rxq = rxq->lease; 64 + *rxq_idx = get_netdev_rx_queue_index(rxq); 65 + *dev = rxq->dev; 66 + } 67 + return rxq; 68 + } 69 + 70 + struct 
netdev_rx_queue * 71 + netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) 72 + { 73 + struct net_device *orig_dev = *dev; 74 + struct netdev_rx_queue *rxq; 75 + 76 + /* Locking order is always from the virtual to the physical device 77 + * see netdev_nl_queue_create_doit(). 78 + */ 79 + netdev_ops_assert_locked(orig_dev); 80 + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); 81 + if (rxq && orig_dev != *dev) 82 + netdev_lock(*dev); 83 + return rxq; 84 + } 85 + 86 + void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, 87 + struct net_device *dev) 88 + { 89 + if (orig_dev != dev) 90 + netdev_unlock(dev); 91 + } 92 + 93 + bool netif_rx_queue_lease_get_owner(struct net_device **dev, 94 + unsigned int *rxq_idx) 95 + { 96 + struct net_device *orig_dev = *dev; 97 + struct netdev_rx_queue *rxq; 98 + 99 + /* The physical device needs to be locked. If there is indeed a lease, 100 + * then the virtual device holds a reference on the physical device 101 + * and the lease stays active until the virtual device is torn down. 102 + * When queues get {un,}leased both devices are always locked. 
103 + */ 104 + netdev_ops_assert_locked(orig_dev); 105 + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); 106 + if (rxq && orig_dev != *dev) 107 + return true; 108 + return false; 109 + } 110 + 111 + /* See also page_pool_is_unreadable() */ 112 + bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) 113 + { 114 + if (rxq_idx < dev->real_num_rx_queues) 115 + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; 116 + return false; 18 117 } 19 118 EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); 119 + 120 + bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) 121 + { 122 + if (rxq_idx < dev->real_num_rx_queues) 123 + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; 124 + return false; 125 + } 20 126 21 127 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) 22 128 { ··· 206 100 const struct pp_memory_provider_params *p, 207 101 struct netlink_ext_ack *extack) 208 102 { 103 + struct net_device *orig_dev = dev; 209 104 struct netdev_rx_queue *rxq; 210 105 int ret; 211 106 212 107 if (!netdev_need_ops_lock(dev)) 213 108 return -EOPNOTSUPP; 214 - 215 109 if (rxq_idx >= dev->real_num_rx_queues) { 216 110 NL_SET_ERR_MSG(extack, "rx queue index out of range"); 217 111 return -ERANGE; 218 112 } 219 - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); 220 113 114 + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); 115 + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); 116 + if (!rxq) { 117 + NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); 118 + return -EBUSY; 119 + } 120 + if (!dev->dev.parent) { 121 + NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); 122 + ret = -EBUSY; 123 + goto out; 124 + } 221 125 if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { 222 126 NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); 223 - return -EINVAL; 127 + ret = -EINVAL; 128 + goto out; 224 129 } 225 130 if 
(dev->cfg->hds_thresh) { 226 131 NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); 227 - return -EINVAL; 132 + ret = -EINVAL; 133 + goto out; 228 134 } 229 135 if (dev_xdp_prog_count(dev)) { 230 136 NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); 231 - return -EEXIST; 137 + ret = -EEXIST; 138 + goto out; 232 139 } 233 - 234 - rxq = __netif_get_rx_queue(dev, rxq_idx); 235 140 if (rxq->mp_params.mp_ops) { 236 141 NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); 237 - return -EEXIST; 142 + ret = -EEXIST; 143 + goto out; 238 144 } 239 145 #ifdef CONFIG_XDP_SOCKETS 240 146 if (rxq->pool) { 241 147 NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); 242 - return -EBUSY; 148 + ret = -EBUSY; 149 + goto out; 243 150 } 244 151 #endif 245 - 246 152 rxq->mp_params = *p; 247 153 ret = netdev_rx_queue_restart(dev, rxq_idx); 248 154 if (ret) { 249 155 rxq->mp_params.mp_ops = NULL; 250 156 rxq->mp_params.mp_priv = NULL; 251 157 } 158 + out: 159 + netif_put_rx_queue_lease_locked(orig_dev, dev); 252 160 return ret; 253 161 } 254 162 ··· 277 157 return ret; 278 158 } 279 159 280 - void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, 160 + void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 281 161 const struct pp_memory_provider_params *old_p) 282 162 { 163 + struct net_device *orig_dev = dev; 283 164 struct netdev_rx_queue *rxq; 284 165 int err; 285 166 286 - if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) 167 + if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) 287 168 return; 288 169 289 - rxq = __netif_get_rx_queue(dev, ifq_idx); 170 + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); 171 + if (WARN_ON_ONCE(!rxq)) 172 + return; 290 173 291 174 /* Callers holding a netdev ref may get here after we already 292 175 * went thru shutdown via dev_memory_provider_uninstall(). 
293 176 */ 294 177 if (dev->reg_state > NETREG_REGISTERED && 295 178 !rxq->mp_params.mp_ops) 296 - return; 179 + goto out; 297 180 298 181 if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || 299 182 rxq->mp_params.mp_priv != old_p->mp_priv)) 300 - return; 183 + goto out; 301 184 302 185 rxq->mp_params.mp_ops = NULL; 303 186 rxq->mp_params.mp_priv = NULL; 304 - err = netdev_rx_queue_restart(dev, ifq_idx); 187 + err = netdev_rx_queue_restart(dev, rxq_idx); 305 188 WARN_ON(err && err != -ENETDOWN); 189 + out: 190 + netif_put_rx_queue_lease_locked(orig_dev, dev); 306 191 } 307 192 308 - void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, 193 + void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 309 194 struct pp_memory_provider_params *old_p) 310 195 { 311 196 netdev_lock(dev); 312 - __net_mp_close_rxq(dev, ifq_idx, old_p); 197 + __net_mp_close_rxq(dev, rxq_idx, old_p); 313 198 netdev_unlock(dev); 314 199 }
+7 -5
net/ethtool/channels.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 3 - #include <net/xdp_sock_drv.h> 3 + #include <net/netdev_queues.h> 4 4 5 5 #include "netlink.h" 6 6 #include "common.h" ··· 169 169 if (ret) 170 170 return ret; 171 171 172 - /* Disabling channels, query zero-copy AF_XDP sockets */ 172 + /* ensure channels are not busy at the moment */ 173 173 from_channel = channels.combined_count + 174 174 min(channels.rx_count, channels.tx_count); 175 - for (i = from_channel; i < old_total; i++) 176 - if (xsk_get_pool_from_qid(dev, i)) { 177 - GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); 175 + for (i = from_channel; i < old_total; i++) { 176 + if (netdev_queue_busy(dev, i, NULL)) { 177 + GENL_SET_ERR_MSG(info, 178 + "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)"); 178 179 return -EINVAL; 179 180 } 181 + } 180 182 181 183 ret = dev->ethtool_ops->set_channels(dev, &channels); 182 184 return ret < 0 ? ret : 1;
+5 -4
net/ethtool/ioctl.c
··· 27 27 #include <linux/net.h> 28 28 #include <linux/pm_runtime.h> 29 29 #include <linux/utsname.h> 30 + #include <linux/ethtool_netlink.h> 30 31 #include <net/devlink.h> 31 32 #include <net/ipv6.h> 32 - #include <net/xdp_sock_drv.h> 33 33 #include <net/flow_offload.h> 34 34 #include <net/netdev_lock.h> 35 - #include <linux/ethtool_netlink.h> 35 + #include <net/netdev_queues.h> 36 + 36 37 #include "common.h" 37 38 38 39 /* State held across locks and calls for commands which have devlink fallback */ ··· 2283 2282 if (ret) 2284 2283 return ret; 2285 2284 2286 - /* Disabling channels, query zero-copy AF_XDP sockets */ 2285 + /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ 2287 2286 from_channel = channels.combined_count + 2288 2287 min(channels.rx_count, channels.tx_count); 2289 2288 to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); 2290 2289 for (i = from_channel; i < to_channel; i++) 2291 - if (xsk_get_pool_from_qid(dev, i)) 2290 + if (netdev_queue_busy(dev, i, NULL)) 2292 2291 return -EINVAL; 2293 2292 2294 2293 ret = dev->ethtool_ops->set_channels(dev, &channels);
+62 -17
net/xdp/xsk.c
··· 23 23 #include <linux/netdevice.h> 24 24 #include <linux/rculist.h> 25 25 #include <linux/vmalloc.h> 26 + 27 + #include <net/netdev_queues.h> 26 28 #include <net/xdp_sock_drv.h> 27 29 #include <net/busy_poll.h> 28 30 #include <net/netdev_lock.h> ··· 105 103 } 106 104 EXPORT_SYMBOL(xsk_uses_need_wakeup); 107 105 108 - struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 106 + struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, 109 107 u16 queue_id) 110 108 { 111 109 if (queue_id < dev->real_num_rx_queues) ··· 119 117 120 118 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 121 119 { 122 - if (queue_id < dev->num_rx_queues) 123 - dev->_rx[queue_id].pool = NULL; 124 - if (queue_id < dev->num_tx_queues) 125 - dev->_tx[queue_id].pool = NULL; 120 + struct net_device *orig_dev = dev; 121 + unsigned int id = queue_id; 122 + 123 + if (id < dev->real_num_rx_queues) 124 + WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); 125 + 126 + if (id < dev->real_num_rx_queues) 127 + dev->_rx[id].pool = NULL; 128 + if (id < dev->real_num_tx_queues) 129 + dev->_tx[id].pool = NULL; 130 + 131 + netif_put_rx_queue_lease_locked(orig_dev, dev); 126 132 } 127 133 128 134 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do ··· 140 130 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 141 131 u16 queue_id) 142 132 { 143 - if (queue_id >= max_t(unsigned int, 144 - dev->real_num_rx_queues, 145 - dev->real_num_tx_queues)) 133 + struct net_device *orig_dev = dev; 134 + unsigned int id = queue_id; 135 + int ret = 0; 136 + 137 + if (id >= max(dev->real_num_rx_queues, 138 + dev->real_num_tx_queues)) 146 139 return -EINVAL; 140 + if (id < dev->real_num_rx_queues) { 141 + if (!netif_get_rx_queue_lease_locked(&dev, &id)) 142 + return -EBUSY; 143 + if (xsk_get_pool_from_qid(dev, id)) { 144 + ret = -EBUSY; 145 + goto out; 146 + } 147 + } 147 148 148 - if (queue_id < dev->real_num_rx_queues) 
149 - dev->_rx[queue_id].pool = pool; 150 - if (queue_id < dev->real_num_tx_queues) 151 - dev->_tx[queue_id].pool = pool; 152 - 153 - return 0; 149 + if (id < dev->real_num_rx_queues) 150 + dev->_rx[id].pool = pool; 151 + if (id < dev->real_num_tx_queues) 152 + dev->_tx[id].pool = pool; 153 + out: 154 + netif_put_rx_queue_lease_locked(orig_dev, dev); 155 + return ret; 154 156 } 155 157 156 158 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, ··· 346 324 return false; 347 325 } 348 326 327 + static bool xsk_dev_queue_valid(const struct xdp_sock *xs, 328 + const struct xdp_rxq_info *info) 329 + { 330 + struct net_device *dev = xs->dev; 331 + u32 queue_index = xs->queue_id; 332 + struct netdev_rx_queue *rxq; 333 + 334 + if (info->dev == dev && 335 + info->queue_index == queue_index) 336 + return true; 337 + 338 + if (queue_index < dev->real_num_rx_queues) { 339 + rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); 340 + if (!rxq) 341 + return false; 342 + 343 + dev = rxq->dev; 344 + queue_index = get_netdev_rx_queue_index(rxq); 345 + 346 + return info->dev == dev && 347 + info->queue_index == queue_index; 348 + } 349 + return false; 350 + } 351 + 349 352 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 350 353 { 351 354 if (!xsk_is_bound(xs)) 352 355 return -ENXIO; 353 - 354 - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 356 + if (!xsk_dev_queue_valid(xs, xdp->rxq)) 355 357 return -EINVAL; 356 - 357 358 if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { 358 359 xs->rx_dropped++; 359 360 return -ENOSPC;
+11
tools/include/uapi/linux/netdev.h
··· 160 160 NETDEV_A_QUEUE_DMABUF, 161 161 NETDEV_A_QUEUE_IO_URING, 162 162 NETDEV_A_QUEUE_XSK, 163 + NETDEV_A_QUEUE_LEASE, 163 164 164 165 __NETDEV_A_QUEUE_MAX, 165 166 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 204 203 }; 205 204 206 205 enum { 206 + NETDEV_A_LEASE_IFINDEX = 1, 207 + NETDEV_A_LEASE_QUEUE, 208 + NETDEV_A_LEASE_NETNS_ID, 209 + 210 + __NETDEV_A_LEASE_MAX, 211 + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) 212 + }; 213 + 214 + enum { 207 215 NETDEV_A_DMABUF_IFINDEX = 1, 208 216 NETDEV_A_DMABUF_QUEUES, 209 217 NETDEV_A_DMABUF_FD, ··· 238 228 NETDEV_CMD_BIND_RX, 239 229 NETDEV_CMD_NAPI_SET, 240 230 NETDEV_CMD_BIND_TX, 231 + NETDEV_CMD_QUEUE_CREATE, 241 232 242 233 __NETDEV_CMD_MAX, 243 234 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+7
tools/testing/selftests/drivers/net/README.rst
··· 62 62 63 63 Local and remote endpoint IP addresses. 64 64 65 + LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 66 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 67 + 68 + Local IP prefix/subnet which can be used to allocate extra IP addresses (for 69 + network name spaces behind macvlan, veth, netkit devices). DUT must be 70 + reachable using these addresses from the endpoint. 71 + 65 72 REMOTE_TYPE 66 73 ~~~~~~~~~~~ 67 74
+2
tools/testing/selftests/drivers/net/hw/Makefile
··· 32 32 irq.py \ 33 33 loopback.sh \ 34 34 nic_timestamp.py \ 35 + nk_netns.py \ 36 + nk_qlease.py \ 35 37 pp_alloc_fail.py \ 36 38 rss_api.py \ 37 39 rss_ctx.py \
+4 -3
tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
··· 3 3 """ 4 4 Driver test environment (hardware-only tests). 5 5 NetDrvEnv and NetDrvEpEnv are the main environment classes. 6 + NetDrvContEnv extends NetDrvEpEnv with netkit container support. 6 7 Former is for local host only tests, latter creates / connects 7 8 to a remote endpoint. See NIPA wiki for more information about 8 9 running and writing driver tests. ··· 30 29 from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ 31 30 ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none 32 31 from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner 33 - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv 32 + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv 34 33 35 34 __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", 36 35 "EthtoolFamily", "NetdevFamily", "NetshaperFamily", ··· 45 44 "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", 46 45 "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", 47 46 "ksft_not_none", "ksft_not_none", 48 - "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", 49 - "Iperf3Runner"] 47 + "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", 48 + "Remote", "Iperf3Runner"] 50 49 except ModuleNotFoundError as e: 51 50 print("Failed importing `net` library from kernel sources") 52 51 print(str(e))
+49
tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/bpf.h> 3 + #include <linux/pkt_cls.h> 4 + #include <linux/if_ether.h> 5 + #include <linux/ipv6.h> 6 + #include <linux/in6.h> 7 + #include <bpf/bpf_endian.h> 8 + #include <bpf/bpf_helpers.h> 9 + 10 + #define TC_ACT_OK 0 11 + #define ETH_P_IPV6 0x86DD 12 + 13 + #define ctx_ptr(field) ((void *)(long)(field)) 14 + 15 + #define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ 16 + a.s6_addr32[1] == b.s6_addr32[1]) 17 + 18 + volatile __u32 netkit_ifindex; 19 + volatile __u8 ipv6_prefix[16]; 20 + 21 + SEC("tc/ingress") 22 + int tc_redirect_peer(struct __sk_buff *skb) 23 + { 24 + void *data_end = ctx_ptr(skb->data_end); 25 + void *data = ctx_ptr(skb->data); 26 + struct in6_addr *peer_addr; 27 + struct ipv6hdr *ip6h; 28 + struct ethhdr *eth; 29 + 30 + peer_addr = (struct in6_addr *)ipv6_prefix; 31 + 32 + if (skb->protocol != bpf_htons(ETH_P_IPV6)) 33 + return TC_ACT_OK; 34 + 35 + eth = data; 36 + if ((void *)(eth + 1) > data_end) 37 + return TC_ACT_OK; 38 + 39 + ip6h = data + sizeof(struct ethhdr); 40 + if ((void *)(ip6h + 1) > data_end) 41 + return TC_ACT_OK; 42 + 43 + if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) 44 + return TC_ACT_OK; 45 + 46 + return bpf_redirect_peer(netkit_ifindex, 0); 47 + } 48 + 49 + char __license[] SEC("license") = "GPL";
+23
tools/testing/selftests/drivers/net/hw/nk_netns.py
··· 1 + #!/usr/bin/env python3 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + from lib.py import ksft_run, ksft_exit 5 + from lib.py import NetDrvContEnv 6 + from lib.py import cmd 7 + 8 + 9 + def test_ping(cfg) -> None: 10 + cfg.require_ipver("6") 11 + 12 + cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) 13 + cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) 14 + 15 + 16 + def main() -> None: 17 + with NetDrvContEnv(__file__) as cfg: 18 + ksft_run([test_ping], args=(cfg,)) 19 + ksft_exit() 20 + 21 + 22 + if __name__ == "__main__": 23 + main()
+55
tools/testing/selftests/drivers/net/hw/nk_qlease.py
··· 1 + #!/usr/bin/env python3 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + import re 5 + from os import path 6 + from lib.py import ksft_run, ksft_exit 7 + from lib.py import NetDrvContEnv 8 + from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen 9 + 10 + 11 + def create_rss_ctx(cfg): 12 + output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout 13 + values = re.search(r'New RSS context is (\d+)', output).group(1) 14 + return int(values) 15 + 16 + 17 + def set_flow_rule(cfg): 18 + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout 19 + values = re.search(r'ID (\d+)', output).group(1) 20 + return int(values) 21 + 22 + 23 + def set_flow_rule_rss(cfg, rss_ctx_id): 24 + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout 25 + values = re.search(r'ID (\d+)', output).group(1) 26 + return int(values) 27 + 28 + 29 + def test_iou_zcrx(cfg) -> None: 30 + cfg.require_ipver('6') 31 + 32 + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") 33 + defer(ethtool, f"-X {cfg.ifname} default") 34 + 35 + flow_rule_id = set_flow_rule(cfg) 36 + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") 37 + 38 + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" 39 + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" 40 + with bkg(rx_cmd, exit_wait=True): 41 + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) 42 + cmd(tx_cmd, host=cfg.remote) 43 + 44 + 45 + def main() -> None: 46 + with NetDrvContEnv(__file__, lease=True) as cfg: 47 + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") 48 + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) 49 + cfg.port = rand_port() 50 + ksft_run([test_iou_zcrx], args=(cfg,)) 51 + ksft_exit() 52 + 53 + 54 + if __name__ == "__main__": 55 + main()
+4 -3
tools/testing/selftests/drivers/net/lib/py/__init__.py
··· 3 3 """ 4 4 Driver test environment. 5 5 NetDrvEnv and NetDrvEpEnv are the main environment classes. 6 + NetDrvContEnv extends NetDrvEpEnv with netkit container support. 6 7 Former is for local host only tests, latter creates / connects 7 8 to a remote endpoint. See NIPA wiki for more information about 8 9 running and writing driver tests. ··· 44 43 "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", 45 44 "ksft_not_none", "ksft_not_none"] 46 45 47 - from .env import NetDrvEnv, NetDrvEpEnv 46 + from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv 48 47 from .load import GenerateTraffic, Iperf3Runner 49 48 from .remote import Remote 50 49 51 - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", 52 - "Iperf3Runner"] 50 + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", 51 + "Remote", "Iperf3Runner"] 53 52 except ModuleNotFoundError as e: 54 53 print("Failed importing `net` library from kernel sources") 55 54 print(str(e))
+157
tools/testing/selftests/drivers/net/lib/py/env.py
# SPDX-License-Identifier: GPL-2.0

import ipaddress
import os
import re
import time
from pathlib import Path
from lib.py import KsftSkipEx, KsftXfailEx
from lib.py import ksft_setup, wait_file
from lib.py import cmd, ethtool, ip, CmdExitFailure
from lib.py import NetNS, NetdevSimDev
from lib.py import NetdevFamily, EthtoolFamily
from .remote import Remote
from . import bpftool


class NetDrvContEnv(NetDrvEpEnv):
    """
    Class for an environment with a netkit pair setup for forwarding traffic
    between the physical interface and a network namespace.
    """

    def __init__(self, src_path, lease=False, **kwargs):
        """
        Set up the netkit pair, optionally lease a physical rx queue to the
        netkit peer, move the peer into a fresh netns, and attach the
        forwarding BPF program on the physical interface's ingress.

        lease: when True, create the netkit pair with two rx queues and
               bind the second one to a real queue of the physical netdev.
        """
        super().__init__(src_path, **kwargs)

        self.require_ipver("6")
        local_prefix = self.env.get("LOCAL_PREFIX_V6")
        if not local_prefix:
            raise KsftSkipEx("LOCAL_PREFIX_V6 required")

        self.netdevnl = NetdevFamily()
        self.ethnl = EthtoolFamily()

        # Drop the prefix length and trailing "::" to get the bare prefix.
        # Note: .rstrip("/64") would be wrong here -- rstrip() takes a *set*
        # of characters, so it would also eat trailing '4'/'6' hex digits
        # belonging to the prefix itself. Split on '/' instead.
        local_prefix = local_prefix.split("/")[0].rstrip(":")
        self.ipv6_prefix = f"{local_prefix}::"
        self.nk_host_ipv6 = f"{local_prefix}::2:1"
        self.nk_guest_ipv6 = f"{local_prefix}::2:2"

        # Initialize all teardown state up front so __del__() stays safe
        # even if the setup below fails part-way through.
        self.netns = None
        self._nk_host_ifname = None
        self._nk_guest_ifname = None
        self._tc_attached = False
        self._bpf_prog_pref = None
        self._bpf_prog_id = None
        self._leased = False

        # A leased queue needs a second rx queue on the netkit device.
        nk_rxqueues = 2 if lease else 1
        ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}")

        # Pick out the freshly created (still down) netkit pair.
        all_links = ip("-d link show", json=True)
        netkit_links = [link for link in all_links
                        if link.get('linkinfo', {}).get('info_kind') == 'netkit'
                        and 'UP' not in link.get('flags', [])]

        if len(netkit_links) != 2:
            raise KsftSkipEx("Failed to create netkit pair")

        # By ifindex order: the lower one is treated as the guest (peer)
        # side, the higher one as the host side.
        netkit_links.sort(key=lambda x: x['ifindex'])
        self._nk_host_ifname = netkit_links[1]['ifname']
        self._nk_guest_ifname = netkit_links[0]['ifname']
        self.nk_host_ifindex = netkit_links[1]['ifindex']
        self.nk_guest_ifindex = netkit_links[0]['ifindex']

        if lease:
            self._lease_queues()

        self._setup_ns()
        self._attach_bpf()

    def __del__(self):
        # Tear down in reverse setup order. Use getattr() with defaults so
        # a partially constructed instance (e.g. __init__ raised before the
        # state flags were assigned) does not blow up during finalization.
        if getattr(self, "_tc_attached", False):
            cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}")
            self._tc_attached = False

        if getattr(self, "_nk_host_ifname", None):
            # Deleting the host side removes the peer along with it.
            cmd(f"ip link del dev {self._nk_host_ifname}")
            self._nk_host_ifname = None
            self._nk_guest_ifname = None

        if getattr(self, "netns", None):
            del self.netns
            self.netns = None

        if getattr(self, "_leased", False):
            # Restore the original ring configuration of the physical netdev.
            self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
                                  'tcp-data-split': 'unknown',
                                  'hds-thresh': self._hds_thresh,
                                  'rx': self._rx_rings})
            self._leased = False

        super().__del__()

    def _lease_queues(self):
        """
        Lease the last combined rx queue of the physical netdev to the
        netkit guest device via the netdev-netlink queue-create op.
        """
        channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}})
        channels = channels['combined-count']
        if channels < 2:
            raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')

        # Save the current ring settings so __del__() can restore them,
        # then enable tcp-data-split and shrink the rx rings.
        rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}})
        self._rx_rings = rings['rx']
        self._hds_thresh = rings.get('hds-thresh', 0)
        self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
                              'tcp-data-split': 'enabled',
                              'hds-thresh': 0,
                              'rx': 64})
        self.src_queue = channels - 1
        bind_result = self.netdevnl.queue_create(
            {
                "ifindex": self.nk_guest_ifindex,
                "type": "rx",
                "lease": {
                    "ifindex": self.ifindex,
                    "queue": {"id": self.src_queue, "type": "rx"},
                },
            }
        )
        self.nk_queue = bind_result['id']
        self._leased = True

    def _setup_ns(self):
        """
        Move the guest netkit device into a fresh network namespace and
        wire up IPv6 link-local routing between host and guest sides.
        """
        self.netns = NetNS()
        ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}")
        ip(f"link set dev {self._nk_host_ifname} up")
        ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad")
        ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}")

        ip("link set lo up", ns=self.netns)
        ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns)
        ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns)
        ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns)
        ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns)

    def _attach_bpf(self):
        """
        Attach the nk_forward BPF program to the physical device's ingress
        and populate its .bss with the IPv6 prefix and host netkit ifindex.
        """
        bpf_obj = self.test_dir / "nk_forward.bpf.o"
        if not bpf_obj.exists():
            raise KsftSkipEx("BPF prog not found")

        cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action")
        self._tc_attached = True

        # Recover the pref (needed for deletion in __del__) and prog id.
        tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout
        match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info)
        if not match:
            raise Exception("Failed to get BPF prog ID")
        self._bpf_prog_pref = int(match.group(1))
        self._bpf_prog_id = int(match.group(2))

        prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True)
        map_ids = prog_info.get("map_ids", [])

        bss_map_id = None
        for map_id in map_ids:
            map_info = bpftool(f"map show id {map_id}", json=True)
            # "name" may be absent from bpftool output; default to "" so we
            # do not trip over NoneType.endswith().
            if map_info.get("name", "").endswith("bss"):
                bss_map_id = map_id

        if bss_map_id is None:
            raise Exception("Failed to find .bss map")

        # NOTE(review): value layout assumed to be a 16-byte IPv6 prefix
        # followed by a little-endian u32 ifindex -- matches what this code
        # writes; confirm against nk_forward.bpf.c's global variables.
        ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix)
        ipv6_bytes = ipv6_addr.packed
        ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little')
        value = ipv6_bytes + ifindex_bytes
        value_hex = ' '.join(f'{b:02x}' for b in value)
        bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}")