
Merge branch 'device-memory-tcp'

Mina Almasry says:

====================
Device Memory TCP

Device memory TCP (devmem TCP) is a proposal for transferring data
to and/or from device memory efficiently, without bouncing the data
to a host memory buffer.

* Problem:

A large number of data transfers have device memory as the source
and/or destination. Accelerators drastically increased the volume
of such transfers. Some examples include:

- ML accelerators transferring large amounts of training data from storage
into GPU/TPU memory. In some cases ML training setup time can be as long
as 50% of TPU compute time; improving data transfer throughput and
efficiency can help improve GPU/TPU utilization.

- Distributed training, where ML accelerators, such as GPUs on different
hosts, exchange data among them.

- Distributed raw block storage applications transfer large amounts of
data with remote SSDs. Much of this data does not require host
processing.

Today, the majority of Device-to-Device data transfers over the network
are implemented as the following low-level operations: Device-to-Host
copy, Host-to-Host network transfer, and Host-to-Device copy.

This implementation is suboptimal, especially for bulk data transfers,
and can put significant strain on system resources such as host memory
bandwidth and PCIe bandwidth. One important reason behind the current
state is the kernel's lack of semantics to express device-to-network
transfers.

* Proposal:

In this patch series we attempt to optimize this use case by implementing
socket APIs that enable the user to:

1. send device memory across the network directly, and
2. receive incoming network packets directly into device memory.

Packet _payloads_ go directly from the NIC to device memory for receive
and from device memory to NIC for transmit.
Packet _headers_ go to/from host memory and are processed by the TCP/IP
stack normally. The NIC _must_ support header split to achieve this.

Advantages:

- Alleviate host memory bandwidth pressure, compared to existing
network-transfer + device-copy semantics.

- Alleviate PCIe BW pressure, by limiting data transfer to the lowest level
of the PCIe tree, compared to traditional path which sends data through
the root complex.

* Patch overview:

** Part 1: netlink API

Gives the user the ability to bind a dma-buf to an RX queue.

** Part 2: scatterlist support

Currently the standard for device memory sharing is DMABUF, which doesn't
generate struct pages. On the other hand, the networking stack (skbs,
drivers, and page pool) operates on pages. We have 2 options:

1. Generate struct pages for dmabuf device memory, or,
2. Modify the networking stack to process scatterlist.

Approach #1 was attempted in RFC v1. RFC v2 implements approach #2.

** Part 3: page pool support

We piggyback on the page pool memory providers proposal:
https://github.com/kuba-moo/linux/tree/pp-providers

This allows the page pool to define a memory provider that handles page
allocation and freeing. It helps abstract most of the device memory
TCP changes from the driver.

** Part 4: support for unreadable skb frags

Page pool iovs are not accessible by the host; we implement changes
throughout the networking stack to correctly handle skbs with unreadable
frags.

** Part 5: recvmsg() APIs

We define APIs for the user to send and receive device memory.

Not included with this series is the GVE devmem TCP support, just to
simplify the review. Code available here if desired:
https://github.com/mina/linux/tree/tcpdevmem

This series is built on top of net-next with Jakub's pp-providers changes
cherry-picked.

* NIC dependencies:

1. (strict) Devmem TCP requires the NIC to support header split, i.e. the
capability to split incoming packets into a header and a payload and to
put each into a separate buffer. Devmem TCP works by using device memory
for the packet payload and host memory for the packet headers.

2. (optional) Devmem TCP works better with flow steering & RSS support,
i.e. the NIC's ability to steer flows into certain rx queues. This
allows the sysadmin to enable devmem TCP on a subset of the rx queues,
steering devmem TCP traffic onto these queues and non-devmem TCP
traffic elsewhere.
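For concreteness, this setup can be expressed with standard ethtool
commands; eth1, queue 15, and dst-port 5201 are placeholder values (matching
the selftest defaults used elsewhere in this series), and tcp-data-split
requires driver support:

```shell
# Enable header split (the strict requirement): payloads can then land
# in a separate buffer from headers.
ethtool -G eth1 tcp-data-split on

# Enable ntuple filters, steer RSS away from the devmem queue, and
# direct the devmem flow (dst-port 5201 here) onto it.
ethtool -K eth1 ntuple on
ethtool --set-rxfh-indir eth1 equal 15
ethtool -N eth1 flow-type tcp4 dst-port 5201 queue 15
```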

The NIC I have access to with these properties is the GVE with DQO support
running in Google Cloud, but any NIC that supports these features would
suffice. I may be able to help reviewers bring up devmem TCP on their NICs.

* Testing:

The series includes a udmabuf kselftest that shows a simple use case of
devmem TCP and validates the entire data path end to end without
a dependency on a specific dmabuf provider.

** Test Setup

Kernel: net-next with this series and memory provider API cherry-picked
locally.

Hardware: Google Cloud A3 VMs.

NIC: GVE with header split & RSS & flow steering support.
====================

Link: https://patch.msgid.link/20240910171458.219195-1-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+2757 -124
+61
Documentation/netlink/specs/netdev.yaml
··· 167 167 "re-attached", they are just waiting to disappear. 168 168 Attribute is absent if Page Pool has not been detached, and 169 169 can still be used to allocate new memory. 170 + - 171 + name: dmabuf 172 + doc: ID of the dmabuf this page-pool is attached to. 173 + type: u32 170 174 - 171 175 name: page-pool-info 172 176 subset-of: page-pool ··· 271 267 - 272 268 name: napi-id 273 269 doc: ID of the NAPI instance which services this queue. 270 + type: u32 271 + - 272 + name: dmabuf 273 + doc: ID of the dmabuf attached to this queue, if any. 274 274 type: u32 275 275 276 276 - ··· 465 457 Number of times driver re-started accepting send 466 458 requests to this queue from the stack. 467 459 type: uint 460 + - 461 + name: queue-id 462 + subset-of: queue 463 + attributes: 464 + - 465 + name: id 466 + - 467 + name: type 468 + - 469 + name: dmabuf 470 + attributes: 471 + - 472 + name: ifindex 473 + doc: netdev ifindex to bind the dmabuf to. 474 + type: u32 475 + checks: 476 + min: 1 477 + - 478 + name: queues 479 + doc: receive queues to bind the dmabuf to. 480 + type: nest 481 + nested-attributes: queue-id 482 + multi-attr: true 483 + - 484 + name: fd 485 + doc: dmabuf file descriptor to bind. 
486 + type: u32 487 + - 488 + name: id 489 + doc: id of the dmabuf binding 490 + type: u32 491 + checks: 492 + min: 1 468 493 469 494 operations: 470 495 list: ··· 551 510 - inflight 552 511 - inflight-mem 553 512 - detach-time 513 + - dmabuf 554 514 dump: 555 515 reply: *pp-reply 556 516 config-cond: page-pool ··· 616 574 - type 617 575 - napi-id 618 576 - ifindex 577 + - dmabuf 619 578 dump: 620 579 request: 621 580 attributes: ··· 662 619 - rx-bytes 663 620 - tx-packets 664 621 - tx-bytes 622 + - 623 + name: bind-rx 624 + doc: Bind dmabuf to netdev 625 + attribute-set: dmabuf 626 + flags: [ admin-perm ] 627 + do: 628 + request: 629 + attributes: 630 + - ifindex 631 + - fd 632 + - queues 633 + reply: 634 + attributes: 635 + - id 636 + 637 + kernel-family: 638 + headers: [ "linux/list.h"] 639 + sock-priv: struct list_head 665 640 666 641 mcast-groups: 667 642 list:
+269
Documentation/networking/devmem.rst
··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + ================= 4 + Device Memory TCP 5 + ================= 6 + 7 + 8 + Intro 9 + ===== 10 + 11 + Device memory TCP (devmem TCP) enables receiving data directly into device 12 + memory (dmabuf). The feature is currently implemented for TCP sockets. 13 + 14 + 15 + Opportunity 16 + ----------- 17 + 18 + A large number of data transfers have device memory as the source and/or 19 + destination. Accelerators drastically increased the prevalence of such 20 + transfers. Some examples include: 21 + 22 + - Distributed training, where ML accelerators, such as GPUs on different hosts, 23 + exchange data. 24 + 25 + - Distributed raw block storage applications transfer large amounts of data with 26 + remote SSDs. Much of this data does not require host processing. 27 + 28 + Typically the Device-to-Device data transfers in the network are implemented as 29 + the following low-level operations: Device-to-Host copy, Host-to-Host network 30 + transfer, and Host-to-Device copy. 31 + 32 + The flow involving host copies is suboptimal, especially for bulk data transfers, 33 + and can put significant strains on system resources such as host memory 34 + bandwidth and PCIe bandwidth. 35 + 36 + Devmem TCP optimizes this use case by implementing socket APIs that enable 37 + the user to receive incoming network packets directly into device memory. 38 + 39 + Packet payloads go directly from the NIC to device memory. 40 + 41 + Packet headers go to host memory and are processed by the TCP/IP stack 42 + normally. The NIC must support header split to achieve this. 43 + 44 + Advantages: 45 + 46 + - Alleviate host memory bandwidth pressure, compared to existing 47 + network-transfer + device-copy semantics. 48 + 49 + - Alleviate PCIe bandwidth pressure, by limiting data transfer to the lowest 50 + level of the PCIe tree, compared to the traditional path which sends data 51 + through the root complex. 
52 + 53 + 54 + More Info 55 + --------- 56 + 57 + slides, video 58 + https://netdevconf.org/0x17/sessions/talk/device-memory-tcp.html 59 + 60 + patchset 61 + [PATCH net-next v24 00/13] Device Memory TCP 62 + https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ 63 + 64 + 65 + Interface 66 + ========= 67 + 68 + 69 + Example 70 + ------- 71 + 72 + tools/testing/selftests/net/ncdevmem.c:do_server shows an example of setting up 73 + the RX path of this API. 74 + 75 + 76 + NIC Setup 77 + --------- 78 + 79 + Header split, flow steering, & RSS are required features for devmem TCP. 80 + 81 + Header split is used to split incoming packets into a header buffer in host 82 + memory, and a payload buffer in device memory. 83 + 84 + Flow steering & RSS are used to ensure that only flows targeting devmem land on 85 + an RX queue bound to devmem. 86 + 87 + Enable header split & flow steering:: 88 + 89 + # enable header split 90 + ethtool -G eth1 tcp-data-split on 91 + 92 + 93 + # enable flow steering 94 + ethtool -K eth1 ntuple on 95 + 96 + Configure RSS to steer all traffic away from the target RX queue (queue 15 in 97 + this example):: 98 + 99 + ethtool --set-rxfh-indir eth1 equal 15 100 + 101 + 102 + The user must bind a dmabuf to any number of RX queues on a given NIC using 103 + the netlink API:: 104 + 105 + /* Bind dmabuf to NIC RX queue 15 */ 106 + struct netdev_queue *queues; 107 + queues = malloc(sizeof(*queues) * 1); 108 + 109 + queues[0]._present.type = 1; 110 + queues[0]._present.idx = 1; 111 + queues[0].type = NETDEV_RX_QUEUE_TYPE_RX; 112 + queues[0].idx = 15; 113 + 114 + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); 115 + 116 + req = netdev_bind_rx_req_alloc(); 117 + netdev_bind_rx_req_set_ifindex(req, 1 /* ifindex */); 118 + netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd); 119 + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); 120 + 121 + rsp = netdev_bind_rx(*ys, req); 122 + 123 + dmabuf_id = rsp->dmabuf_id; 124 + 125 + 
126 + The netlink API returns a dmabuf_id: a unique ID that refers to this dmabuf 127 + that has been bound. 128 + 129 + The user can unbind the dmabuf from the netdevice by closing the netlink socket 130 + that established the binding. We do this so that the binding is automatically 131 + unbound even if the userspace process crashes. 132 + 133 + Note that any reasonably well-behaved dmabuf from any exporter should work with 134 + devmem TCP, even if the dmabuf is not actually backed by devmem. An example of 135 + this is udmabuf, which wraps user memory (non-devmem) in a dmabuf. 136 + 137 + 138 + Socket Setup 139 + ------------ 140 + 141 + The socket must be flow steered to the dmabuf bound RX queue:: 142 + 143 + ethtool -N eth1 flow-type tcp4 ... queue 15 144 + 145 + 146 + Receiving data 147 + -------------- 148 + 149 + The user application must signal to the kernel that it is capable of receiving 150 + devmem data by passing the MSG_SOCK_DEVMEM flag to recvmsg:: 151 + 152 + ret = recvmsg(fd, &msg, MSG_SOCK_DEVMEM); 153 + 154 + Applications that do not specify the MSG_SOCK_DEVMEM flag will receive an EFAULT 155 + on devmem data. 156 + 157 + Devmem data is received directly into the dmabuf bound to the NIC in 'NIC 158 + Setup', and the kernel signals such to the user via the SCM_DEVMEM_* cmsgs:: 159 + 160 + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { 161 + if (cm->cmsg_level != SOL_SOCKET || 162 + (cm->cmsg_type != SCM_DEVMEM_DMABUF && 163 + cm->cmsg_type != SCM_DEVMEM_LINEAR)) 164 + continue; 165 + 166 + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm); 167 + 168 + if (cm->cmsg_type == SCM_DEVMEM_DMABUF) { 169 + /* Frag landed in dmabuf. 170 + * 171 + * dmabuf_cmsg->dmabuf_id is the dmabuf the 172 + * frag landed on. 173 + * 174 + * dmabuf_cmsg->frag_offset is the offset into 175 + * the dmabuf where the frag starts. 176 + * 177 + * dmabuf_cmsg->frag_size is the size of the 178 + * frag. 
179 + * 180 + * dmabuf_cmsg->frag_token is a token used to 181 + * refer to this frag for later freeing. 182 + */ 183 + 184 + struct dmabuf_token token; 185 + token.token_start = dmabuf_cmsg->frag_token; 186 + token.token_count = 1; 187 + continue; 188 + } 189 + 190 + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) 191 + /* Frag landed in linear buffer. 192 + * 193 + * dmabuf_cmsg->frag_size is the size of the 194 + * frag. 195 + */ 196 + continue; 197 + 198 + } 199 + 200 + Applications may receive 2 cmsgs: 201 + 202 + - SCM_DEVMEM_DMABUF: this indicates the fragment landed in the dmabuf indicated 203 + by dmabuf_id. 204 + 205 + - SCM_DEVMEM_LINEAR: this indicates the fragment landed in the linear buffer. 206 + This typically happens when the NIC is unable to split the packet at the 207 + header boundary, such that part (or all) of the payload landed in host 208 + memory. 209 + 210 + Applications may receive no SO_DEVMEM_* cmsgs. That indicates non-devmem, 211 + regular TCP data that landed on an RX queue not bound to a dmabuf. 212 + 213 + 214 + Freeing frags 215 + ------------- 216 + 217 + Frags received via SCM_DEVMEM_DMABUF are pinned by the kernel while the user 218 + processes the frag. The user must return the frag to the kernel via 219 + SO_DEVMEM_DONTNEED:: 220 + 221 + ret = setsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &token, 222 + sizeof(token)); 223 + 224 + The user must ensure the tokens are returned to the kernel in a timely manner. 225 + Failure to do so will exhaust the limited dmabuf that is bound to the RX queue 226 + and will lead to packet drops. 227 + 228 + 229 + Implementation & Caveats 230 + ======================== 231 + 232 + Unreadable skbs 233 + --------------- 234 + 235 + Devmem payloads are inaccessible to the kernel processing the packets. This 236 + results in a few quirks for payloads of devmem skbs: 237 + 238 + - Loopback is not functional. Loopback relies on copying the payload, which is 239 + not possible with devmem skbs. 
240 + 241 + - Software checksum calculation fails. 242 + 243 + - TCP Dump and bpf can't access devmem packet payloads. 244 + 245 + 246 + Testing 247 + ======= 248 + 249 + More realistic example code can be found in the kernel source under 250 + ``tools/testing/selftests/net/ncdevmem.c`` 251 + 252 + ncdevmem is a devmem TCP netcat. It works very similarly to netcat, but 253 + receives data directly into a udmabuf. 254 + 255 + To run ncdevmem, you need to run it on a server on the machine under test, and 256 + you need to run netcat on a peer to provide the TX data. 257 + 258 + ncdevmem has a validation mode as well that expects a repeating pattern of 259 + incoming data and validates it as such. For example, you can launch 260 + ncdevmem on the server by:: 261 + 262 + ncdevmem -s <server IP> -c <client IP> -f eth1 -d 3 -n 0000:06:00.0 -l \ 263 + -p 5201 -v 7 264 + 265 + On client side, use regular netcat to send TX data to ncdevmem process 266 + on the server:: 267 + 268 + yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ 269 + tr \\n \\0 | head -c 5G | nc <server IP> 5201 -p 5201
+1
Documentation/networking/index.rst
··· 49 49 cdc_mbim 50 50 dccp 51 51 dctcp 52 + devmem 52 53 dns_resolver 53 54 driver 54 55 eql
+6
arch/alpha/include/uapi/asm/socket.h
··· 140 140 #define SO_PASSPIDFD 76 141 141 #define SO_PEERPIDFD 77 142 142 143 + #define SO_DEVMEM_LINEAR 78 144 + #define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR 145 + #define SO_DEVMEM_DMABUF 79 146 + #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 147 + #define SO_DEVMEM_DONTNEED 80 148 + 143 149 #if !defined(__KERNEL__) 144 150 145 151 #if __BITS_PER_LONG == 64
+6
arch/mips/include/uapi/asm/socket.h
··· 151 151 #define SO_PASSPIDFD 76 152 152 #define SO_PEERPIDFD 77 153 153 154 + #define SO_DEVMEM_LINEAR 78 155 + #define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR 156 + #define SO_DEVMEM_DMABUF 79 157 + #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 158 + #define SO_DEVMEM_DONTNEED 80 159 + 154 160 #if !defined(__KERNEL__) 155 161 156 162 #if __BITS_PER_LONG == 64
+6
arch/parisc/include/uapi/asm/socket.h
··· 132 132 #define SO_PASSPIDFD 0x404A 133 133 #define SO_PEERPIDFD 0x404B 134 134 135 + #define SO_DEVMEM_LINEAR 78 136 + #define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR 137 + #define SO_DEVMEM_DMABUF 79 138 + #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 139 + #define SO_DEVMEM_DONTNEED 80 140 + 135 141 #if !defined(__KERNEL__) 136 142 137 143 #if __BITS_PER_LONG == 64
+6
arch/sparc/include/uapi/asm/socket.h
··· 133 133 #define SO_PASSPIDFD 0x0055 134 134 #define SO_PEERPIDFD 0x0056 135 135 136 + #define SO_DEVMEM_LINEAR 0x0057 137 + #define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR 138 + #define SO_DEVMEM_DMABUF 0x0058 139 + #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 140 + #define SO_DEVMEM_DONTNEED 0x0059 141 + 136 142 #if !defined(__KERNEL__) 137 143 138 144
+2
include/linux/netdevice.h
··· 3953 3953 int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); 3954 3954 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); 3955 3955 3956 + u32 dev_get_min_mp_channel_count(const struct net_device *dev); 3957 + 3956 3958 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); 3957 3959 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); 3958 3960 int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
+58 -3
include/linux/skbuff.h
··· 827 827 * @csum_level: indicates the number of consecutive checksums found in 828 828 * the packet minus one that have been verified as 829 829 * CHECKSUM_UNNECESSARY (max 3) 830 + * @unreadable: indicates that at least 1 of the fragments in this skb is 831 + * unreadable. 830 832 * @dst_pending_confirm: need to confirm neighbour 831 833 * @decrypted: Decrypted SKB 832 834 * @slow_gro: state present at GRO time, slower prepare step required ··· 1010 1008 #if IS_ENABLED(CONFIG_IP_SCTP) 1011 1009 __u8 csum_not_inet:1; 1012 1010 #endif 1013 - 1011 + __u8 unreadable:1; 1014 1012 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) 1015 1013 __u16 tc_index; /* traffic control index */ 1016 1014 #endif ··· 1826 1824 __skb_zcopy_downgrade_managed(skb); 1827 1825 } 1828 1826 1827 + /* Return true if frags in this skb are readable by the host. */ 1828 + static inline bool skb_frags_readable(const struct sk_buff *skb) 1829 + { 1830 + return !skb->unreadable; 1831 + } 1832 + 1829 1833 static inline void skb_mark_not_on_list(struct sk_buff *skb) 1830 1834 { 1831 1835 skb->next = NULL; ··· 2548 2540 static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, 2549 2541 netmem_ref netmem, int off, int size) 2550 2542 { 2551 - struct page *page = netmem_to_page(netmem); 2543 + struct page *page; 2552 2544 2553 2545 __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); 2546 + 2547 + if (netmem_is_net_iov(netmem)) { 2548 + skb->unreadable = true; 2549 + return; 2550 + } 2551 + 2552 + page = netmem_to_page(netmem); 2554 2553 2555 2554 /* Propagate page pfmemalloc to the skb if we can. The problem is 2556 2555 * that not all callers have unique ownership of the page but rely ··· 3539 3524 fragto->offset = fragfrom->offset; 3540 3525 } 3541 3526 3527 + /* Return: true if the skb_frag contains a net_iov. 
*/ 3528 + static inline bool skb_frag_is_net_iov(const skb_frag_t *frag) 3529 + { 3530 + return netmem_is_net_iov(frag->netmem); 3531 + } 3532 + 3533 + /** 3534 + * skb_frag_net_iov - retrieve the net_iov referred to by fragment 3535 + * @frag: the fragment 3536 + * 3537 + * Return: the &struct net_iov associated with @frag. Returns NULL if this 3538 + * frag has no associated net_iov. 3539 + */ 3540 + static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag) 3541 + { 3542 + if (!skb_frag_is_net_iov(frag)) 3543 + return NULL; 3544 + 3545 + return netmem_to_net_iov(frag->netmem); 3546 + } 3547 + 3542 3548 /** 3543 3549 * skb_frag_page - retrieve the page referred to by a paged fragment 3544 3550 * @frag: the paged fragment 3545 3551 * 3546 - * Returns the &struct page associated with @frag. 3552 + * Return: the &struct page associated with @frag. Returns NULL if this frag 3553 + * has no associated page. 3547 3554 */ 3548 3555 static inline struct page *skb_frag_page(const skb_frag_t *frag) 3549 3556 { 3557 + if (skb_frag_is_net_iov(frag)) 3558 + return NULL; 3559 + 3550 3560 return netmem_to_page(frag->netmem); 3561 + } 3562 + 3563 + /** 3564 + * skb_frag_netmem - retrieve the netmem referred to by a fragment 3565 + * @frag: the fragment 3566 + * 3567 + * Return: the &netmem_ref associated with @frag. 
3568 + */ 3569 + static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) 3570 + { 3571 + return frag->netmem; 3551 3572 } 3552 3573 3553 3574 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, 3554 3575 unsigned int headroom); 3555 3576 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, 3556 3577 struct bpf_prog *prog); 3578 + 3557 3579 /** 3558 3580 * skb_frag_address - gets the address of the data contained in a paged fragment 3559 3581 * @frag: the paged fragment buffer ··· 3600 3548 */ 3601 3549 static inline void *skb_frag_address(const skb_frag_t *frag) 3602 3550 { 3551 + if (!skb_frag_page(frag)) 3552 + return NULL; 3553 + 3603 3554 return page_address(skb_frag_page(frag)) + skb_frag_off(frag); 3604 3555 } 3605 3556
+4 -5
include/linux/skbuff_ref.h
··· 34 34 35 35 bool napi_pp_put_page(netmem_ref netmem); 36 36 37 - static inline void 38 - skb_page_unref(struct page *page, bool recycle) 37 + static inline void skb_page_unref(netmem_ref netmem, bool recycle) 39 38 { 40 39 #ifdef CONFIG_PAGE_POOL 41 - if (recycle && napi_pp_put_page(page_to_netmem(page))) 40 + if (recycle && napi_pp_put_page(netmem)) 42 41 return; 43 42 #endif 44 - put_page(page); 43 + put_page(netmem_to_page(netmem)); 45 44 } 46 45 47 46 /** ··· 53 54 */ 54 55 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) 55 56 { 56 - skb_page_unref(skb_frag_page(frag), recycle); 57 + skb_page_unref(skb_frag_netmem(frag), recycle); 57 58 } 58 59 59 60 /**
+1
include/linux/socket.h
··· 327 327 * plain text and require encryption 328 328 */ 329 329 330 + #define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ 330 331 #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ 331 332 #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ 332 333 #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
+5
include/net/netdev_rx_queue.h
··· 6 6 #include <linux/netdevice.h> 7 7 #include <linux/sysfs.h> 8 8 #include <net/xdp.h> 9 + #include <net/page_pool/types.h> 9 10 10 11 /* This structure contains an instance of an RX queue. */ 11 12 struct netdev_rx_queue { ··· 26 25 * Readers and writers must hold RTNL 27 26 */ 28 27 struct napi_struct *napi; 28 + struct pp_memory_provider_params mp_params; 29 29 } ____cacheline_aligned_in_smp; 30 30 31 31 /* ··· 56 54 return index; 57 55 } 58 56 #endif 57 + 58 + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); 59 + 59 60 #endif
+125 -7
include/net/netmem.h
··· 8 8 #ifndef _NET_NETMEM_H 9 9 #define _NET_NETMEM_H 10 10 11 + #include <linux/mm.h> 12 + #include <net/net_debug.h> 13 + 14 + /* net_iov */ 15 + 16 + DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); 17 + 18 + /* We overload the LSB of the struct page pointer to indicate whether it's 19 + * a page or net_iov. 20 + */ 21 + #define NET_IOV 0x01UL 22 + 23 + struct net_iov { 24 + unsigned long __unused_padding; 25 + unsigned long pp_magic; 26 + struct page_pool *pp; 27 + struct dmabuf_genpool_chunk_owner *owner; 28 + unsigned long dma_addr; 29 + atomic_long_t pp_ref_count; 30 + }; 31 + 32 + /* These fields in struct page are used by the page_pool and net stack: 33 + * 34 + * struct { 35 + * unsigned long pp_magic; 36 + * struct page_pool *pp; 37 + * unsigned long _pp_mapping_pad; 38 + * unsigned long dma_addr; 39 + * atomic_long_t pp_ref_count; 40 + * }; 41 + * 42 + * We mirror the page_pool fields here so the page_pool can access these fields 43 + * without worrying whether the underlying fields belong to a page or net_iov. 44 + * 45 + * The non-net stack fields of struct page are private to the mm stack and must 46 + * never be mirrored to net_iov. 47 + */ 48 + #define NET_IOV_ASSERT_OFFSET(pg, iov) \ 49 + static_assert(offsetof(struct page, pg) == \ 50 + offsetof(struct net_iov, iov)) 51 + NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); 52 + NET_IOV_ASSERT_OFFSET(pp, pp); 53 + NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); 54 + NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); 55 + #undef NET_IOV_ASSERT_OFFSET 56 + 57 + /* netmem */ 58 + 11 59 /** 12 60 * typedef netmem_ref - a nonexistent type marking a reference to generic 13 61 * network memory. ··· 67 19 */ 68 20 typedef unsigned long __bitwise netmem_ref; 69 21 22 + static inline bool netmem_is_net_iov(const netmem_ref netmem) 23 + { 24 + return (__force unsigned long)netmem & NET_IOV; 25 + } 26 + 70 27 /* This conversion fails (returns NULL) if the netmem_ref is not struct page 71 28 * backed. 
72 - * 73 - * Currently struct page is the only possible netmem, and this helper never 74 - * fails. 75 29 */ 76 30 static inline struct page *netmem_to_page(netmem_ref netmem) 77 31 { 32 + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) 33 + return NULL; 34 + 78 35 return (__force struct page *)netmem; 79 36 } 80 37 81 - /* Converting from page to netmem is always safe, because a page can always be 82 - * a netmem. 83 - */ 38 + static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) 39 + { 40 + if (netmem_is_net_iov(netmem)) 41 + return (struct net_iov *)((__force unsigned long)netmem & 42 + ~NET_IOV); 43 + 44 + DEBUG_NET_WARN_ON_ONCE(true); 45 + return NULL; 46 + } 47 + 48 + static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) 49 + { 50 + return (__force netmem_ref)((unsigned long)niov | NET_IOV); 51 + } 52 + 84 53 static inline netmem_ref page_to_netmem(struct page *page) 85 54 { 86 55 return (__force netmem_ref)page; ··· 105 40 106 41 static inline int netmem_ref_count(netmem_ref netmem) 107 42 { 43 + /* The non-pp refcount of net_iov is always 1. On net_iov, we only 44 + * support pp refcounting which uses the pp_ref_count field. 
45 + */ 46 + if (netmem_is_net_iov(netmem)) 47 + return 1; 48 + 108 49 return page_ref_count(netmem_to_page(netmem)); 109 50 } 110 51 111 - static inline unsigned long netmem_to_pfn(netmem_ref netmem) 52 + static inline unsigned long netmem_pfn_trace(netmem_ref netmem) 112 53 { 54 + if (netmem_is_net_iov(netmem)) 55 + return 0; 56 + 113 57 return page_to_pfn(netmem_to_page(netmem)); 58 + } 59 + 60 + static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) 61 + { 62 + return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); 63 + } 64 + 65 + static inline struct page_pool *netmem_get_pp(netmem_ref netmem) 66 + { 67 + return __netmem_clear_lsb(netmem)->pp; 68 + } 69 + 70 + static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) 71 + { 72 + return &__netmem_clear_lsb(netmem)->pp_ref_count; 73 + } 74 + 75 + static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) 76 + { 77 + /* NUMA node preference only makes sense if we're allocating 78 + * system memory. Memory providers (which give us net_iovs) 79 + * choose for us. 80 + */ 81 + if (netmem_is_net_iov(netmem)) 82 + return true; 83 + 84 + return page_to_nid(netmem_to_page(netmem)) == pref_nid; 114 85 } 115 86 116 87 static inline netmem_ref netmem_compound_head(netmem_ref netmem) 117 88 { 89 + /* niov are never compounded */ 90 + if (netmem_is_net_iov(netmem)) 91 + return netmem; 92 + 118 93 return page_to_netmem(compound_head(netmem_to_page(netmem))); 94 + } 95 + 96 + static inline void *netmem_address(netmem_ref netmem) 97 + { 98 + if (netmem_is_net_iov(netmem)) 99 + return NULL; 100 + 101 + return page_address(netmem_to_page(netmem)); 102 + } 103 + 104 + static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) 105 + { 106 + return __netmem_clear_lsb(netmem)->dma_addr; 119 107 } 120 108 121 109 #endif /* _NET_NETMEM_H */
+7 -32
include/net/page_pool/helpers.h
··· 216 216 217 217 static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) 218 218 { 219 - atomic_long_set(&netmem_to_page(netmem)->pp_ref_count, nr); 219 + atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); 220 220 } 221 221 222 222 /** ··· 244 244 245 245 static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) 246 246 { 247 - struct page *page = netmem_to_page(netmem); 247 + atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); 248 248 long ret; 249 249 250 250 /* If nr == pp_ref_count then we have cleared all remaining ··· 261 261 * initially, and only overwrite it when the page is partitioned into 262 262 * more than one piece. 263 263 */ 264 - if (atomic_long_read(&page->pp_ref_count) == nr) { 264 + if (atomic_long_read(pp_ref_count) == nr) { 265 265 /* As we have ensured nr is always one for constant case using 266 266 * the BUILD_BUG_ON(), only need to handle the non-constant case 267 267 * here for pp_ref_count draining, which is a rare case. 268 268 */ 269 269 BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); 270 270 if (!__builtin_constant_p(nr)) 271 - atomic_long_set(&page->pp_ref_count, 1); 271 + atomic_long_set(pp_ref_count, 1); 272 272 273 273 return 0; 274 274 } 275 275 276 - ret = atomic_long_sub_return(nr, &page->pp_ref_count); 276 + ret = atomic_long_sub_return(nr, pp_ref_count); 277 277 WARN_ON(ret < 0); 278 278 279 279 /* We are the last user here too, reset pp_ref_count back to 1 to ··· 282 282 * page_pool_unref_page() currently. 
283 283 */ 284 284 if (unlikely(!ret)) 285 - atomic_long_set(&page->pp_ref_count, 1); 285 + atomic_long_set(pp_ref_count, 1); 286 286 287 287 return ret; 288 288 } ··· 401 401 402 402 static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) 403 403 { 404 - struct page *page = netmem_to_page(netmem); 405 - 406 - dma_addr_t ret = page->dma_addr; 404 + dma_addr_t ret = netmem_get_dma_addr(netmem); 407 405 408 406 if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) 409 407 ret <<= PAGE_SHIFT; ··· 419 421 static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) 420 422 { 421 423 return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); 422 - } 423 - 424 - static inline bool page_pool_set_dma_addr_netmem(netmem_ref netmem, 425 - dma_addr_t addr) 426 - { 427 - struct page *page = netmem_to_page(netmem); 428 - 429 - if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { 430 - page->dma_addr = addr >> PAGE_SHIFT; 431 - 432 - /* We assume page alignment to shave off bottom bits, 433 - * if this "compression" doesn't work we need to drop. 434 - */ 435 - return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT; 436 - } 437 - 438 - page->dma_addr = addr; 439 - return false; 440 424 } 441 425 442 426 /** ··· 441 461 page_pool_get_dma_addr(page), 442 462 offset + pool->p.offset, dma_sync_size, 443 463 page_pool_get_dma_dir(pool)); 444 - } 445 - 446 - static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 447 - { 448 - return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); 449 464 } 450 465 451 466 static inline bool page_pool_put(struct page_pool *pool)
+21 -2
include/net/page_pool/types.h
··· 20 20 * device driver responsibility 21 21 */ 22 22 #define PP_FLAG_SYSTEM_POOL BIT(2) /* Global system page_pool */ 23 + 24 + /* Allow unreadable (net_iov backed) netmem in this page_pool. Drivers setting 25 + * this must be able to support unreadable netmem, where netmem_address() would 26 + * return NULL. This flag should not be set for header page_pools. 27 + * 28 + * If the driver sets PP_FLAG_ALLOW_UNREADABLE_NETMEM, it should also set 29 + * page_pool_params.slow.queue_idx. 30 + */ 31 + #define PP_FLAG_ALLOW_UNREADABLE_NETMEM BIT(3) 32 + 23 33 #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ 24 - PP_FLAG_SYSTEM_POOL) 34 + PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) 25 35 26 36 /* 27 37 * Fast allocation side cache array/stack ··· 67 57 * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV 68 58 * @slow: params with slowpath access only (initialization and Netlink) 69 59 * @netdev: netdev this pool will serve (leave as NULL if none or multiple) 70 - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL 60 + * @queue_idx: queue idx this page_pool is being created for. 61 + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL, 62 + * PP_FLAG_ALLOW_UNREADABLE_NETMEM. 71 63 */ 72 64 struct page_pool_params { 73 65 struct_group_tagged(page_pool_params_fast, fast, ··· 84 72 ); 85 73 struct_group_tagged(page_pool_params_slow, slow, 86 74 struct net_device *netdev; 75 + unsigned int queue_idx; 87 76 unsigned int flags; 88 77 /* private: used by test code only */ 89 78 void (*init_callback)(netmem_ref netmem, void *arg); ··· 152 139 */ 153 140 #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) 154 141 142 + struct pp_memory_provider_params { 143 + void *mp_priv; 144 + }; 145 + 155 146 struct page_pool { 156 147 struct page_pool_params_fast p; 157 148 ··· 213 196 * TODO: Implement bulk return pages into this structure. 
214 197 */ 215 198 struct ptr_ring ring; 199 + 200 + void *mp_priv; 216 201 217 202 #ifdef CONFIG_PAGE_POOL_STATS 218 203 /* recycle stats are per-cpu to avoid locking */
+2
include/net/sock.h
··· 337 337 * @sk_txtime_report_errors: set report errors mode for SO_TXTIME 338 338 * @sk_txtime_unused: unused txtime flags 339 339 * @ns_tracker: tracker for netns reference 340 + * @sk_user_frags: xarray of pages the user is holding a reference on. 340 341 */ 341 342 struct sock { 342 343 /* ··· 543 542 #endif 544 543 struct rcu_head sk_rcu; 545 544 netns_tracker ns_tracker; 545 + struct xarray sk_user_frags; 546 546 }; 547 547 548 548 struct sock_bh_locked {
+2 -1
include/net/tcp.h
··· 1069 1069 /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ 1070 1070 return likely(tcp_skb_can_collapse_to(to) && 1071 1071 mptcp_skb_can_collapse(to, from) && 1072 - skb_pure_zcopy_same(to, from)); 1072 + skb_pure_zcopy_same(to, from) && 1073 + skb_frags_readable(to) == skb_frags_readable(from)); 1073 1074 } 1074 1075 1075 1076 static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
+6 -6
include/trace/events/page_pool.h
··· 57 57 __entry->pool = pool; 58 58 __entry->netmem = (__force unsigned long)netmem; 59 59 __entry->release = release; 60 - __entry->pfn = netmem_to_pfn(netmem); 60 + __entry->pfn = netmem_pfn_trace(netmem); 61 61 ), 62 62 63 - TP_printk("page_pool=%p netmem=%p pfn=0x%lx release=%u", 63 + TP_printk("page_pool=%p netmem=%p is_net_iov=%lu pfn=0x%lx release=%u", 64 64 __entry->pool, (void *)__entry->netmem, 65 - __entry->pfn, __entry->release) 65 + __entry->netmem & NET_IOV, __entry->pfn, __entry->release) 66 66 ); 67 67 68 68 TRACE_EVENT(page_pool_state_hold, ··· 83 83 __entry->pool = pool; 84 84 __entry->netmem = (__force unsigned long)netmem; 85 85 __entry->hold = hold; 86 - __entry->pfn = netmem_to_pfn(netmem); 86 + __entry->pfn = netmem_pfn_trace(netmem); 87 87 ), 88 88 89 - TP_printk("page_pool=%p netmem=%p pfn=0x%lx hold=%u", 89 + TP_printk("page_pool=%p netmem=%p is_net_iov=%lu, pfn=0x%lx hold=%u", 90 90 __entry->pool, (void *)__entry->netmem, 91 - __entry->pfn, __entry->hold) 91 + __entry->netmem & NET_IOV, __entry->pfn, __entry->hold) 92 92 ); 93 93 94 94 TRACE_EVENT(page_pool_update_nid,
+6
include/uapi/asm-generic/socket.h
··· 135 135 #define SO_PASSPIDFD 76 136 136 #define SO_PEERPIDFD 77 137 137 138 + #define SO_DEVMEM_LINEAR 78 139 + #define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR 140 + #define SO_DEVMEM_DMABUF 79 141 + #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF 142 + #define SO_DEVMEM_DONTNEED 80 143 + 138 144 #if !defined(__KERNEL__) 139 145 140 146 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
+13
include/uapi/linux/netdev.h
··· 93 93 NETDEV_A_PAGE_POOL_INFLIGHT, 94 94 NETDEV_A_PAGE_POOL_INFLIGHT_MEM, 95 95 NETDEV_A_PAGE_POOL_DETACH_TIME, 96 + NETDEV_A_PAGE_POOL_DMABUF, 96 97 97 98 __NETDEV_A_PAGE_POOL_MAX, 98 99 NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) ··· 132 131 NETDEV_A_QUEUE_IFINDEX, 133 132 NETDEV_A_QUEUE_TYPE, 134 133 NETDEV_A_QUEUE_NAPI_ID, 134 + NETDEV_A_QUEUE_DMABUF, 135 135 136 136 __NETDEV_A_QUEUE_MAX, 137 137 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 176 174 }; 177 175 178 176 enum { 177 + NETDEV_A_DMABUF_IFINDEX = 1, 178 + NETDEV_A_DMABUF_QUEUES, 179 + NETDEV_A_DMABUF_FD, 180 + NETDEV_A_DMABUF_ID, 181 + 182 + __NETDEV_A_DMABUF_MAX, 183 + NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) 184 + }; 185 + 186 + enum { 179 187 NETDEV_CMD_DEV_GET = 1, 180 188 NETDEV_CMD_DEV_ADD_NTF, 181 189 NETDEV_CMD_DEV_DEL_NTF, ··· 198 186 NETDEV_CMD_QUEUE_GET, 199 187 NETDEV_CMD_NAPI_GET, 200 188 NETDEV_CMD_QSTATS_GET, 189 + NETDEV_CMD_BIND_RX, 201 190 202 191 __NETDEV_CMD_MAX, 203 192 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+18
include/uapi/linux/uio.h
··· 20 20 __kernel_size_t iov_len; /* Must be size_t (1003.1g) */ 21 21 }; 22 22 23 + struct dmabuf_cmsg { 24 + __u64 frag_offset; /* offset into the dmabuf where the frag starts. 25 + */ 26 + __u32 frag_size; /* size of the frag. */ 27 + __u32 frag_token; /* token representing this frag for 28 + * DEVMEM_DONTNEED. 29 + */ 30 + __u32 dmabuf_id; /* dmabuf id this frag belongs to. */ 31 + __u32 flags; /* Currently unused. Reserved for future 32 + * uses. 33 + */ 34 + }; 35 + 36 + struct dmabuf_token { 37 + __u32 token_start; 38 + __u32 token_count; 39 + }; 40 + 23 41 /* 24 42 * UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1) 25 43 */
+5
net/Kconfig
··· 66 66 config SKB_EXTENSIONS 67 67 bool 68 68 69 + config NET_DEVMEM 70 + def_bool y 71 + depends on DMA_SHARED_BUFFER 72 + depends on GENERIC_ALLOCATOR 73 + 69 74 menu "Networking options" 70 75 71 76 source "net/packet/Kconfig"
+2
net/core/Makefile
··· 19 19 20 20 obj-y += net-sysfs.o 21 21 obj-y += hotdata.o 22 + obj-y += netdev_rx_queue.o 22 23 obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o 23 24 obj-$(CONFIG_PROC_FS) += net-procfs.o 24 25 obj-$(CONFIG_NET_PKTGEN) += pktgen.o ··· 44 43 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o 45 44 obj-$(CONFIG_OF) += of_net.o 46 45 obj-$(CONFIG_NET_TEST) += net_test.o 46 + obj-$(CONFIG_NET_DEVMEM) += devmem.o
+6
net/core/datagram.c
··· 407 407 return 0; 408 408 } 409 409 410 + if (!skb_frags_readable(skb)) 411 + goto short_copy; 412 + 410 413 /* Copy paged appendix. Hmm... why does this look so complicated? */ 411 414 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 412 415 int end; ··· 625 622 struct iov_iter *from, size_t length) 626 623 { 627 624 int frag = skb_shinfo(skb)->nr_frags; 625 + 626 + if (!skb_frags_readable(skb)) 627 + return -EFAULT; 628 628 629 629 while (length && iov_iter_count(from)) { 630 630 struct page *head, *last_head = NULL;
+32 -1
net/core/dev.c
··· 161 161 #include <linux/phy_link_topology.h> 162 162 163 163 #include "dev.h" 164 + #include "devmem.h" 164 165 #include "net-sysfs.h" 165 166 166 167 static DEFINE_SPINLOCK(ptype_lock); ··· 3312 3311 return -EINVAL; 3313 3312 } 3314 3313 3314 + if (!skb_frags_readable(skb)) { 3315 + return -EFAULT; 3316 + } 3317 + 3315 3318 /* Before computing a checksum, we should make sure no frag could 3316 3319 * be modified by an external entity : checksum could be wrong. 3317 3320 */ ··· 3438 3433 if (!(dev->features & NETIF_F_HIGHDMA)) { 3439 3434 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3440 3435 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3436 + struct page *page = skb_frag_page(frag); 3441 3437 3442 - if (PageHighMem(skb_frag_page(frag))) 3438 + if (page && PageHighMem(page)) 3443 3439 return 1; 3444 3440 } 3445 3441 } ··· 9373 9367 if (!dev->netdev_ops->ndo_bpf) 9374 9368 return -EOPNOTSUPP; 9375 9369 9370 + if (dev_get_min_mp_channel_count(dev)) { 9371 + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); 9372 + return -EBUSY; 9373 + } 9374 + 9376 9375 return dev->netdev_ops->ndo_bpf(dev, bpf); 9377 9376 } 9378 9377 EXPORT_SYMBOL_GPL(dev_xdp_propagate); ··· 9409 9398 { 9410 9399 struct netdev_bpf xdp; 9411 9400 int err; 9401 + 9402 + if (dev_get_min_mp_channel_count(dev)) { 9403 + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); 9404 + return -EBUSY; 9405 + } 9412 9406 9413 9407 memset(&xdp, 0, sizeof(xdp)); 9414 9408 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; ··· 9837 9821 if (old_prog) 9838 9822 bpf_prog_put(old_prog); 9839 9823 return err; 9824 + } 9825 + 9826 + u32 dev_get_min_mp_channel_count(const struct net_device *dev) 9827 + { 9828 + int i; 9829 + 9830 + ASSERT_RTNL(); 9831 + 9832 + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) 9833 + if (dev->_rx[i].mp_params.mp_priv) 9834 + /* The channel count is the idx plus 1. 
*/ 9835 + return i + 1; 9836 + 9837 + return 0; 9840 9838 } 9841 9839 9842 9840 /** ··· 11389 11359 dev_tcx_uninstall(dev); 11390 11360 dev_xdp_uninstall(dev); 11391 11361 bpf_dev_bound_netdev_unregister(dev); 11362 + dev_dmabuf_uninstall(dev); 11392 11363 11393 11364 netdev_offload_xstats_disable_all(dev); 11394 11365
+389
net/core/devmem.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Devmem TCP 4 + * 5 + * Authors: Mina Almasry <almasrymina@google.com> 6 + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> 7 + * Kaiyuan Zhang <kaiyuanz@google.com 8 + */ 9 + 10 + #include <linux/dma-buf.h> 11 + #include <linux/genalloc.h> 12 + #include <linux/mm.h> 13 + #include <linux/netdevice.h> 14 + #include <linux/types.h> 15 + #include <net/netdev_queues.h> 16 + #include <net/netdev_rx_queue.h> 17 + #include <net/page_pool/helpers.h> 18 + #include <trace/events/page_pool.h> 19 + 20 + #include "devmem.h" 21 + #include "mp_dmabuf_devmem.h" 22 + #include "page_pool_priv.h" 23 + 24 + /* Device memory support */ 25 + 26 + /* Protected by rtnl_lock() */ 27 + static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); 28 + 29 + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, 30 + struct gen_pool_chunk *chunk, 31 + void *not_used) 32 + { 33 + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; 34 + 35 + kvfree(owner->niovs); 36 + kfree(owner); 37 + } 38 + 39 + static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) 40 + { 41 + struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); 42 + 43 + return owner->base_dma_addr + 44 + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); 45 + } 46 + 47 + void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) 48 + { 49 + size_t size, avail; 50 + 51 + gen_pool_for_each_chunk(binding->chunk_pool, 52 + net_devmem_dmabuf_free_chunk_owner, NULL); 53 + 54 + size = gen_pool_size(binding->chunk_pool); 55 + avail = gen_pool_avail(binding->chunk_pool); 56 + 57 + if (!WARN(size != avail, "can't destroy genpool. 
size=%zu, avail=%zu", 58 + size, avail)) 59 + gen_pool_destroy(binding->chunk_pool); 60 + 61 + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, 62 + DMA_FROM_DEVICE); 63 + dma_buf_detach(binding->dmabuf, binding->attachment); 64 + dma_buf_put(binding->dmabuf); 65 + xa_destroy(&binding->bound_rxqs); 66 + kfree(binding); 67 + } 68 + 69 + struct net_iov * 70 + net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) 71 + { 72 + struct dmabuf_genpool_chunk_owner *owner; 73 + unsigned long dma_addr; 74 + struct net_iov *niov; 75 + ssize_t offset; 76 + ssize_t index; 77 + 78 + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, 79 + (void **)&owner); 80 + if (!dma_addr) 81 + return NULL; 82 + 83 + offset = dma_addr - owner->base_dma_addr; 84 + index = offset / PAGE_SIZE; 85 + niov = &owner->niovs[index]; 86 + 87 + niov->pp_magic = 0; 88 + niov->pp = NULL; 89 + atomic_long_set(&niov->pp_ref_count, 0); 90 + 91 + return niov; 92 + } 93 + 94 + void net_devmem_free_dmabuf(struct net_iov *niov) 95 + { 96 + struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); 97 + unsigned long dma_addr = net_devmem_get_dma_addr(niov); 98 + 99 + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, 100 + PAGE_SIZE))) 101 + return; 102 + 103 + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); 104 + } 105 + 106 + void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) 107 + { 108 + struct netdev_rx_queue *rxq; 109 + unsigned long xa_idx; 110 + unsigned int rxq_idx; 111 + 112 + if (binding->list.next) 113 + list_del(&binding->list); 114 + 115 + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { 116 + WARN_ON(rxq->mp_params.mp_priv != binding); 117 + 118 + rxq->mp_params.mp_priv = NULL; 119 + 120 + rxq_idx = get_netdev_rx_queue_index(rxq); 121 + 122 + WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); 123 + } 124 + 125 + xa_erase(&net_devmem_dmabuf_bindings, binding->id); 126 + 127 + 
net_devmem_dmabuf_binding_put(binding); 128 + } 129 + 130 + int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, 131 + struct net_devmem_dmabuf_binding *binding, 132 + struct netlink_ext_ack *extack) 133 + { 134 + struct netdev_rx_queue *rxq; 135 + u32 xa_idx; 136 + int err; 137 + 138 + if (rxq_idx >= dev->real_num_rx_queues) { 139 + NL_SET_ERR_MSG(extack, "rx queue index out of range"); 140 + return -ERANGE; 141 + } 142 + 143 + rxq = __netif_get_rx_queue(dev, rxq_idx); 144 + if (rxq->mp_params.mp_priv) { 145 + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); 146 + return -EEXIST; 147 + } 148 + 149 + #ifdef CONFIG_XDP_SOCKETS 150 + if (rxq->pool) { 151 + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); 152 + return -EBUSY; 153 + } 154 + #endif 155 + 156 + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, 157 + GFP_KERNEL); 158 + if (err) 159 + return err; 160 + 161 + rxq->mp_params.mp_priv = binding; 162 + 163 + err = netdev_rx_queue_restart(dev, rxq_idx); 164 + if (err) 165 + goto err_xa_erase; 166 + 167 + return 0; 168 + 169 + err_xa_erase: 170 + rxq->mp_params.mp_priv = NULL; 171 + xa_erase(&binding->bound_rxqs, xa_idx); 172 + 173 + return err; 174 + } 175 + 176 + struct net_devmem_dmabuf_binding * 177 + net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, 178 + struct netlink_ext_ack *extack) 179 + { 180 + struct net_devmem_dmabuf_binding *binding; 181 + static u32 id_alloc_next; 182 + struct scatterlist *sg; 183 + struct dma_buf *dmabuf; 184 + unsigned int sg_idx, i; 185 + unsigned long virtual; 186 + int err; 187 + 188 + dmabuf = dma_buf_get(dmabuf_fd); 189 + if (IS_ERR(dmabuf)) 190 + return ERR_CAST(dmabuf); 191 + 192 + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, 193 + dev_to_node(&dev->dev)); 194 + if (!binding) { 195 + err = -ENOMEM; 196 + goto err_put_dmabuf; 197 + } 198 + 199 + binding->dev = dev; 200 + 201 + err = 
xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, 202 + binding, xa_limit_32b, &id_alloc_next, 203 + GFP_KERNEL); 204 + if (err < 0) 205 + goto err_free_binding; 206 + 207 + xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); 208 + 209 + refcount_set(&binding->ref, 1); 210 + 211 + binding->dmabuf = dmabuf; 212 + 213 + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); 214 + if (IS_ERR(binding->attachment)) { 215 + err = PTR_ERR(binding->attachment); 216 + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); 217 + goto err_free_id; 218 + } 219 + 220 + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, 221 + DMA_FROM_DEVICE); 222 + if (IS_ERR(binding->sgt)) { 223 + err = PTR_ERR(binding->sgt); 224 + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); 225 + goto err_detach; 226 + } 227 + 228 + /* For simplicity we expect to make PAGE_SIZE allocations, but the 229 + * binding can be much more flexible than that. We may be able to 230 + * allocate MTU sized chunks here. Leave that for future work... 
231 + */ 232 + binding->chunk_pool = 233 + gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); 234 + if (!binding->chunk_pool) { 235 + err = -ENOMEM; 236 + goto err_unmap; 237 + } 238 + 239 + virtual = 0; 240 + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { 241 + dma_addr_t dma_addr = sg_dma_address(sg); 242 + struct dmabuf_genpool_chunk_owner *owner; 243 + size_t len = sg_dma_len(sg); 244 + struct net_iov *niov; 245 + 246 + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, 247 + dev_to_node(&dev->dev)); 248 + if (!owner) { 249 + err = -ENOMEM; 250 + goto err_free_chunks; 251 + } 252 + 253 + owner->base_virtual = virtual; 254 + owner->base_dma_addr = dma_addr; 255 + owner->num_niovs = len / PAGE_SIZE; 256 + owner->binding = binding; 257 + 258 + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, 259 + dma_addr, len, dev_to_node(&dev->dev), 260 + owner); 261 + if (err) { 262 + kfree(owner); 263 + err = -EINVAL; 264 + goto err_free_chunks; 265 + } 266 + 267 + owner->niovs = kvmalloc_array(owner->num_niovs, 268 + sizeof(*owner->niovs), 269 + GFP_KERNEL); 270 + if (!owner->niovs) { 271 + err = -ENOMEM; 272 + goto err_free_chunks; 273 + } 274 + 275 + for (i = 0; i < owner->num_niovs; i++) { 276 + niov = &owner->niovs[i]; 277 + niov->owner = owner; 278 + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), 279 + net_devmem_get_dma_addr(niov)); 280 + } 281 + 282 + virtual += len; 283 + } 284 + 285 + return binding; 286 + 287 + err_free_chunks: 288 + gen_pool_for_each_chunk(binding->chunk_pool, 289 + net_devmem_dmabuf_free_chunk_owner, NULL); 290 + gen_pool_destroy(binding->chunk_pool); 291 + err_unmap: 292 + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, 293 + DMA_FROM_DEVICE); 294 + err_detach: 295 + dma_buf_detach(dmabuf, binding->attachment); 296 + err_free_id: 297 + xa_erase(&net_devmem_dmabuf_bindings, binding->id); 298 + err_free_binding: 299 + kfree(binding); 300 + err_put_dmabuf: 301 + dma_buf_put(dmabuf); 302 + return 
ERR_PTR(err); 303 + } 304 + 305 + void dev_dmabuf_uninstall(struct net_device *dev) 306 + { 307 + struct net_devmem_dmabuf_binding *binding; 308 + struct netdev_rx_queue *rxq; 309 + unsigned long xa_idx; 310 + unsigned int i; 311 + 312 + for (i = 0; i < dev->real_num_rx_queues; i++) { 313 + binding = dev->_rx[i].mp_params.mp_priv; 314 + if (!binding) 315 + continue; 316 + 317 + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) 318 + if (rxq == &dev->_rx[i]) { 319 + xa_erase(&binding->bound_rxqs, xa_idx); 320 + break; 321 + } 322 + } 323 + } 324 + 325 + /*** "Dmabuf devmem memory provider" ***/ 326 + 327 + int mp_dmabuf_devmem_init(struct page_pool *pool) 328 + { 329 + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 330 + 331 + if (!binding) 332 + return -EINVAL; 333 + 334 + if (!pool->dma_map) 335 + return -EOPNOTSUPP; 336 + 337 + if (pool->dma_sync) 338 + return -EOPNOTSUPP; 339 + 340 + if (pool->p.order != 0) 341 + return -E2BIG; 342 + 343 + net_devmem_dmabuf_binding_get(binding); 344 + return 0; 345 + } 346 + 347 + netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) 348 + { 349 + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 350 + struct net_iov *niov; 351 + netmem_ref netmem; 352 + 353 + niov = net_devmem_alloc_dmabuf(binding); 354 + if (!niov) 355 + return 0; 356 + 357 + netmem = net_iov_to_netmem(niov); 358 + 359 + page_pool_set_pp_info(pool, netmem); 360 + 361 + pool->pages_state_hold_cnt++; 362 + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); 363 + return netmem; 364 + } 365 + 366 + void mp_dmabuf_devmem_destroy(struct page_pool *pool) 367 + { 368 + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 369 + 370 + net_devmem_dmabuf_binding_put(binding); 371 + } 372 + 373 + bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) 374 + { 375 + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); 376 + 377 + if 
(WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 378 + return false; 379 + 380 + if (WARN_ON_ONCE(refcount != 1)) 381 + return false; 382 + 383 + page_pool_clear_pp_info(netmem); 384 + 385 + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); 386 + 387 + /* We don't want the page pool put_page()ing our net_iovs. */ 388 + return false; 389 + }
+180
net/core/devmem.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Device memory TCP support 4 + * 5 + * Authors: Mina Almasry <almasrymina@google.com> 6 + * Willem de Bruijn <willemb@google.com> 7 + * Kaiyuan Zhang <kaiyuanz@google.com> 8 + * 9 + */ 10 + #ifndef _NET_DEVMEM_H 11 + #define _NET_DEVMEM_H 12 + 13 + struct netlink_ext_ack; 14 + 15 + struct net_devmem_dmabuf_binding { 16 + struct dma_buf *dmabuf; 17 + struct dma_buf_attachment *attachment; 18 + struct sg_table *sgt; 19 + struct net_device *dev; 20 + struct gen_pool *chunk_pool; 21 + 22 + /* The user holds a ref (via the netlink API) for as long as they want 23 + * the binding to remain alive. Each page pool using this binding holds 24 + * a ref to keep the binding alive. Each allocated net_iov holds a 25 + * ref. 26 + * 27 + * The binding undos itself and unmaps the underlying dmabuf once all 28 + * those refs are dropped and the binding is no longer desired or in 29 + * use. 30 + */ 31 + refcount_t ref; 32 + 33 + /* The list of bindings currently active. Used for netlink to notify us 34 + * of the user dropping the bind. 35 + */ 36 + struct list_head list; 37 + 38 + /* rxq's this binding is active on. */ 39 + struct xarray bound_rxqs; 40 + 41 + /* ID of this binding. Globally unique to all bindings currently 42 + * active. 43 + */ 44 + u32 id; 45 + }; 46 + 47 + #if defined(CONFIG_NET_DEVMEM) 48 + /* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist 49 + * entry from the dmabuf is inserted into the genpool as a chunk, and needs 50 + * this owner struct to keep track of some metadata necessary to create 51 + * allocations from this chunk. 52 + */ 53 + struct dmabuf_genpool_chunk_owner { 54 + /* Offset into the dma-buf where this chunk starts. */ 55 + unsigned long base_virtual; 56 + 57 + /* dma_addr of the start of the chunk. */ 58 + dma_addr_t base_dma_addr; 59 + 60 + /* Array of net_iovs for this chunk. 
*/ 61 + struct net_iov *niovs; 62 + size_t num_niovs; 63 + 64 + struct net_devmem_dmabuf_binding *binding; 65 + }; 66 + 67 + void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); 68 + struct net_devmem_dmabuf_binding * 69 + net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, 70 + struct netlink_ext_ack *extack); 71 + void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); 72 + int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, 73 + struct net_devmem_dmabuf_binding *binding, 74 + struct netlink_ext_ack *extack); 75 + void dev_dmabuf_uninstall(struct net_device *dev); 76 + 77 + static inline struct dmabuf_genpool_chunk_owner * 78 + net_iov_owner(const struct net_iov *niov) 79 + { 80 + return niov->owner; 81 + } 82 + 83 + static inline unsigned int net_iov_idx(const struct net_iov *niov) 84 + { 85 + return niov - net_iov_owner(niov)->niovs; 86 + } 87 + 88 + static inline struct net_devmem_dmabuf_binding * 89 + net_iov_binding(const struct net_iov *niov) 90 + { 91 + return net_iov_owner(niov)->binding; 92 + } 93 + 94 + static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) 95 + { 96 + struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); 97 + 98 + return owner->base_virtual + 99 + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); 100 + } 101 + 102 + static inline u32 net_iov_binding_id(const struct net_iov *niov) 103 + { 104 + return net_iov_owner(niov)->binding->id; 105 + } 106 + 107 + static inline void 108 + net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) 109 + { 110 + refcount_inc(&binding->ref); 111 + } 112 + 113 + static inline void 114 + net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) 115 + { 116 + if (!refcount_dec_and_test(&binding->ref)) 117 + return; 118 + 119 + __net_devmem_dmabuf_binding_free(binding); 120 + } 121 + 122 + struct net_iov * 123 + net_devmem_alloc_dmabuf(struct 
net_devmem_dmabuf_binding *binding); 124 + void net_devmem_free_dmabuf(struct net_iov *ppiov); 125 + 126 + #else 127 + struct net_devmem_dmabuf_binding; 128 + 129 + static inline void 130 + __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) 131 + { 132 + } 133 + 134 + static inline struct net_devmem_dmabuf_binding * 135 + net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, 136 + struct netlink_ext_ack *extack) 137 + { 138 + return ERR_PTR(-EOPNOTSUPP); 139 + } 140 + 141 + static inline void 142 + net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) 143 + { 144 + } 145 + 146 + static inline int 147 + net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, 148 + struct net_devmem_dmabuf_binding *binding, 149 + struct netlink_ext_ack *extack) 150 + 151 + { 152 + return -EOPNOTSUPP; 153 + } 154 + 155 + static inline void dev_dmabuf_uninstall(struct net_device *dev) 156 + { 157 + } 158 + 159 + static inline struct net_iov * 160 + net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) 161 + { 162 + return NULL; 163 + } 164 + 165 + static inline void net_devmem_free_dmabuf(struct net_iov *ppiov) 166 + { 167 + } 168 + 169 + static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) 170 + { 171 + return 0; 172 + } 173 + 174 + static inline u32 net_iov_binding_id(const struct net_iov *niov) 175 + { 176 + return 0; 177 + } 178 + #endif 179 + 180 + #endif /* _NET_DEVMEM_H */
+2 -1
net/core/gro.c
··· 408 408 pinfo = skb_shinfo(skb); 409 409 frag0 = &pinfo->frags[0]; 410 410 411 - if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && 411 + if (pinfo->nr_frags && skb_frag_page(frag0) && 412 + !PageHighMem(skb_frag_page(frag0)) && 412 413 (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { 413 414 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 414 415 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+44
net/core/mp_dmabuf_devmem.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Dmabuf device memory provider. 4 + * 5 + * Authors: Mina Almasry <almasrymina@google.com> 6 + * 7 + */ 8 + #ifndef _NET_MP_DMABUF_DEVMEM_H 9 + #define _NET_MP_DMABUF_DEVMEM_H 10 + 11 + #include <net/netmem.h> 12 + 13 + #if defined(CONFIG_NET_DEVMEM) 14 + int mp_dmabuf_devmem_init(struct page_pool *pool); 15 + 16 + netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp); 17 + 18 + void mp_dmabuf_devmem_destroy(struct page_pool *pool); 19 + 20 + bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem); 21 + #else 22 + static inline int mp_dmabuf_devmem_init(struct page_pool *pool) 23 + { 24 + return -EOPNOTSUPP; 25 + } 26 + 27 + static inline netmem_ref 28 + mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) 29 + { 30 + return 0; 31 + } 32 + 33 + static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool) 34 + { 35 + } 36 + 37 + static inline bool 38 + mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) 39 + { 40 + return false; 41 + } 42 + #endif 43 + 44 + #endif /* _NET_MP_DMABUF_DEVMEM_H */
+23
net/core/netdev-genl-gen.c
··· 9 9 #include "netdev-genl-gen.h" 10 10 11 11 #include <uapi/linux/netdev.h> 12 + #include <linux/list.h> 12 13 13 14 /* Integer value ranges */ 14 15 static const struct netlink_range_validation netdev_a_page_pool_id_range = { ··· 26 25 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { 27 26 [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), 28 27 [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), 28 + }; 29 + 30 + const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { 31 + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, 32 + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), 29 33 }; 30 34 31 35 /* NETDEV_CMD_DEV_GET - do */ ··· 78 72 static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { 79 73 [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 80 74 [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), 75 + }; 76 + 77 + /* NETDEV_CMD_BIND_RX - do */ 78 + static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { 79 + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 80 + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, 81 + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), 81 82 }; 82 83 83 84 /* Ops table for netdev */ ··· 164 151 .maxattr = NETDEV_A_QSTATS_SCOPE, 165 152 .flags = GENL_CMD_CAP_DUMP, 166 153 }, 154 + { 155 + .cmd = NETDEV_CMD_BIND_RX, 156 + .doit = netdev_nl_bind_rx_doit, 157 + .policy = netdev_bind_rx_nl_policy, 158 + .maxattr = NETDEV_A_DMABUF_FD, 159 + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, 160 + }, 167 161 }; 168 162 169 163 static const struct genl_multicast_group netdev_nl_mcgrps[] = { ··· 188 168 .n_split_ops = ARRAY_SIZE(netdev_nl_ops), 189 169 .mcgrps = netdev_nl_mcgrps, 190 170 .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), 171 + .sock_priv_size = sizeof(struct list_head), 172 + .sock_priv_init 
= (void *)netdev_nl_sock_priv_init, 173 + .sock_priv_destroy = (void *)netdev_nl_sock_priv_destroy, 191 174 };
+6
net/core/netdev-genl-gen.h
··· 10 10 #include <net/genetlink.h> 11 11 12 12 #include <uapi/linux/netdev.h> 13 + #include <linux/list.h> 13 14 14 15 /* Common nested types */ 15 16 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; 17 + extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; 16 18 17 19 int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); 18 20 int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); ··· 32 30 int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); 33 31 int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, 34 32 struct netlink_callback *cb); 33 + int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); 35 34 36 35 enum { 37 36 NETDEV_NLGRP_MGMT, ··· 40 37 }; 41 38 42 39 extern struct genl_family netdev_nl_family; 40 + 41 + void netdev_nl_sock_priv_init(struct list_head *priv); 42 + void netdev_nl_sock_priv_destroy(struct list_head *priv); 43 43 44 44 #endif /* _LINUX_NETDEV_GEN_H */
+135 -4
net/core/netdev-genl.c
··· 3 3 #include <linux/netdevice.h> 4 4 #include <linux/notifier.h> 5 5 #include <linux/rtnetlink.h> 6 + #include <net/busy_poll.h> 6 7 #include <net/net_namespace.h> 8 + #include <net/netdev_queues.h> 9 + #include <net/netdev_rx_queue.h> 7 10 #include <net/sock.h> 8 11 #include <net/xdp.h> 9 12 #include <net/xdp_sock.h> 10 - #include <net/netdev_rx_queue.h> 11 - #include <net/netdev_queues.h> 12 - #include <net/busy_poll.h> 13 13 14 - #include "netdev-genl-gen.h" 15 14 #include "dev.h" 15 + #include "devmem.h" 16 + #include "netdev-genl-gen.h" 16 17 17 18 struct netdev_nl_dump_ctx { 18 19 unsigned long ifindex; ··· 295 294 netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, 296 295 u32 q_idx, u32 q_type, const struct genl_info *info) 297 296 { 297 + struct net_devmem_dmabuf_binding *binding; 298 298 struct netdev_rx_queue *rxq; 299 299 struct netdev_queue *txq; 300 300 void *hdr; ··· 315 313 if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, 316 314 rxq->napi->napi_id)) 317 315 goto nla_put_failure; 316 + 317 + binding = rxq->mp_params.mp_priv; 318 + if (binding && 319 + nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) 320 + goto nla_put_failure; 321 + 318 322 break; 319 323 case NETDEV_QUEUE_TYPE_TX: 320 324 txq = netdev_get_tx_queue(netdev, q_idx); ··· 729 721 rtnl_unlock(); 730 722 731 723 return err; 724 + } 725 + 726 + int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) 727 + { 728 + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; 729 + struct net_devmem_dmabuf_binding *binding; 730 + struct list_head *sock_binding_list; 731 + u32 ifindex, dmabuf_fd, rxq_idx; 732 + struct net_device *netdev; 733 + struct sk_buff *rsp; 734 + struct nlattr *attr; 735 + int rem, err = 0; 736 + void *hdr; 737 + 738 + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || 739 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || 740 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) 741 + return -EINVAL; 742 + 743 + 
ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); 744 + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); 745 + 746 + sock_binding_list = genl_sk_priv_get(&netdev_nl_family, 747 + NETLINK_CB(skb).sk); 748 + if (IS_ERR(sock_binding_list)) 749 + return PTR_ERR(sock_binding_list); 750 + 751 + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); 752 + if (!rsp) 753 + return -ENOMEM; 754 + 755 + hdr = genlmsg_iput(rsp, info); 756 + if (!hdr) { 757 + err = -EMSGSIZE; 758 + goto err_genlmsg_free; 759 + } 760 + 761 + rtnl_lock(); 762 + 763 + netdev = __dev_get_by_index(genl_info_net(info), ifindex); 764 + if (!netdev || !netif_device_present(netdev)) { 765 + err = -ENODEV; 766 + goto err_unlock; 767 + } 768 + 769 + if (dev_xdp_prog_count(netdev)) { 770 + NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); 771 + err = -EEXIST; 772 + goto err_unlock; 773 + } 774 + 775 + binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); 776 + if (IS_ERR(binding)) { 777 + err = PTR_ERR(binding); 778 + goto err_unlock; 779 + } 780 + 781 + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, 782 + genlmsg_data(info->genlhdr), 783 + genlmsg_len(info->genlhdr), rem) { 784 + err = nla_parse_nested( 785 + tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, 786 + netdev_queue_id_nl_policy, info->extack); 787 + if (err < 0) 788 + goto err_unbind; 789 + 790 + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || 791 + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { 792 + err = -EINVAL; 793 + goto err_unbind; 794 + } 795 + 796 + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { 797 + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); 798 + err = -EINVAL; 799 + goto err_unbind; 800 + } 801 + 802 + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); 803 + 804 + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, 805 + info->extack); 806 + if (err) 807 + goto err_unbind; 808 
+ } 809 + 810 + list_add(&binding->list, sock_binding_list); 811 + 812 + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); 813 + genlmsg_end(rsp, hdr); 814 + 815 + err = genlmsg_reply(rsp, info); 816 + if (err) 817 + goto err_unbind; 818 + 819 + rtnl_unlock(); 820 + 821 + return 0; 822 + 823 + err_unbind: 824 + net_devmem_unbind_dmabuf(binding); 825 + err_unlock: 826 + rtnl_unlock(); 827 + err_genlmsg_free: 828 + nlmsg_free(rsp); 829 + return err; 830 + } 831 + 832 + void netdev_nl_sock_priv_init(struct list_head *priv) 833 + { 834 + INIT_LIST_HEAD(priv); 835 + } 836 + 837 + void netdev_nl_sock_priv_destroy(struct list_head *priv) 838 + { 839 + struct net_devmem_dmabuf_binding *binding; 840 + struct net_devmem_dmabuf_binding *temp; 841 + 842 + list_for_each_entry_safe(binding, temp, priv, list) { 843 + rtnl_lock(); 844 + net_devmem_unbind_dmabuf(binding); 845 + rtnl_unlock(); 846 + } 732 847 } 733 848 734 849 static int netdev_genl_netdevice_event(struct notifier_block *nb,
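The bind path above follows the kernel's usual goto-unwind convention: a failure at any queue tears down everything bound so far through a single net_devmem_unbind_dmabuf() call at the err_unbind label. A minimal userspace model of that unwind ladder (all names and the fake_binding struct are illustrative stand-ins, not kernel API):

```c
#include <assert.h>

/* Hypothetical stand-in for the binding object; this only models the
 * unwind ladder in netdev_nl_bind_rx_doit(), not the real netlink code. */
struct fake_binding { int bound_queues; int released; };

/* Bind @nqueues rx queues, failing at queue index @fail_at (-1 = no
 * failure). On any error the whole binding is released, mirroring the
 * err_unbind label in the patch. */
static int bind_rx_queues(struct fake_binding *b, int nqueues, int fail_at)
{
	int i, err = 0;

	for (i = 0; i < nqueues; i++) {
		if (i == fail_at) {	/* e.g. a non-RX queue type */
			err = -22;	/* -EINVAL */
			goto err_unbind;
		}
		b->bound_queues++;
	}
	return 0;

err_unbind:
	/* One unbind tears down every queue bound so far. */
	b->bound_queues = 0;
	b->released = 1;
	return err;
}
```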
net/core/netdev_rx_queue.c (+81)
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + #include <linux/netdevice.h> 4 + #include <net/netdev_queues.h> 5 + #include <net/netdev_rx_queue.h> 6 + 7 + #include "page_pool_priv.h" 8 + 9 + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) 10 + { 11 + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); 12 + void *new_mem, *old_mem; 13 + int err; 14 + 15 + if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop || 16 + !dev->queue_mgmt_ops->ndo_queue_mem_free || 17 + !dev->queue_mgmt_ops->ndo_queue_mem_alloc || 18 + !dev->queue_mgmt_ops->ndo_queue_start) 19 + return -EOPNOTSUPP; 20 + 21 + ASSERT_RTNL(); 22 + 23 + new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); 24 + if (!new_mem) 25 + return -ENOMEM; 26 + 27 + old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); 28 + if (!old_mem) { 29 + err = -ENOMEM; 30 + goto err_free_new_mem; 31 + } 32 + 33 + err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); 34 + if (err) 35 + goto err_free_old_mem; 36 + 37 + err = page_pool_check_memory_provider(dev, rxq); 38 + if (err) 39 + goto err_free_new_queue_mem; 40 + 41 + err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx); 42 + if (err) 43 + goto err_free_new_queue_mem; 44 + 45 + err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx); 46 + if (err) 47 + goto err_start_queue; 48 + 49 + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); 50 + 51 + kvfree(old_mem); 52 + kvfree(new_mem); 53 + 54 + return 0; 55 + 56 + err_start_queue: 57 + /* Restarting the queue with old_mem should be successful as we haven't 58 + * changed any of the queue configuration, and there is not much we can 59 + * do to recover from a failure here. 60 + * 61 + * WARN if we fail to recover the old rx queue, and at least free 62 + * old_mem so we don't also leak that. 
63 + */ 64 + if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) { 65 + WARN(1, 66 + "Failed to restart old queue in error path. RX queue %d may be unhealthy.", 67 + rxq_idx); 68 + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); 69 + } 70 + 71 + err_free_new_queue_mem: 72 + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem); 73 + 74 + err_free_old_mem: 75 + kvfree(old_mem); 76 + 77 + err_free_new_mem: 78 + kvfree(new_mem); 79 + 80 + return err; 81 + }
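netdev_rx_queue_restart() allocates the replacement queue memory before stopping the live queue, so an allocation failure leaves the queue untouched, and a failed start of the new memory is rolled back onto the old memory. A small userspace sketch of that ordering (the struct and the *_ok knobs are illustrative; error codes only mimic -ENOMEM/-EIO):

```c
#include <assert.h>

/* Toy model of a queue being restarted onto new backing memory. */
struct fake_queue { int running; int generation; };

static int queue_restart(struct fake_queue *q, int alloc_ok, int start_ok)
{
	/* Step 1: allocate new memory before touching the live queue. */
	if (!alloc_ok)
		return -12;	/* -ENOMEM; old queue is untouched */

	/* Step 2: stop the old queue, then start it on the new memory. */
	q->running = 0;		/* ndo_queue_stop(old) */
	if (!start_ok) {
		/* Roll back: restart with the old memory so the queue
		 * stays healthy, as the err_start_queue path does. */
		q->running = 1;
		return -5;
	}
	q->running = 1;		/* ndo_queue_start(new) */
	q->generation++;	/* new memory is now live */
	return 0;
}
```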
net/core/netmem_priv.h (+31)
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef __NETMEM_PRIV_H 4 + #define __NETMEM_PRIV_H 5 + 6 + static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) 7 + { 8 + return __netmem_clear_lsb(netmem)->pp_magic; 9 + } 10 + 11 + static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) 12 + { 13 + __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; 14 + } 15 + 16 + static inline void netmem_clear_pp_magic(netmem_ref netmem) 17 + { 18 + __netmem_clear_lsb(netmem)->pp_magic = 0; 19 + } 20 + 21 + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) 22 + { 23 + __netmem_clear_lsb(netmem)->pp = pool; 24 + } 25 + 26 + static inline void netmem_set_dma_addr(netmem_ref netmem, 27 + unsigned long dma_addr) 28 + { 29 + __netmem_clear_lsb(netmem)->dma_addr = dma_addr; 30 + } 31 + #endif
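These helpers all route the netmem_ref through __netmem_clear_lsb() because the reference encodes, in its low bit, whether it points at a struct page or a struct net_iov. A simplified model of that tagged-pointer scheme (types and names are illustrative; one struct stands in for both page and net_iov here):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative model of a netmem_ref: a pointer-sized integer whose low
 * bit says whether it refers to a host page (0) or a net_iov (1). */
typedef uintptr_t netmem_ref_model;

struct fake_page { unsigned long pp_magic; };

static netmem_ref_model page_to_ref(struct fake_page *p)
{
	return (netmem_ref_model)p;
}

static netmem_ref_model iov_to_ref(struct fake_page *p)
{
	return (netmem_ref_model)p | 1UL;	/* tag as net_iov */
}

static int ref_is_net_iov(netmem_ref_model r)
{
	return (int)(r & 1UL);
}

/* Equivalent of __netmem_clear_lsb(): strip the tag bit before
 * dereferencing fields shared by both underlying types. */
static struct fake_page *ref_to_obj(netmem_ref_model r)
{
	return (struct fake_page *)(r & ~1UL);
}
```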
net/core/page_pool.c (+85 -34)
··· 11 11 #include <linux/slab.h> 12 12 #include <linux/device.h> 13 13 14 + #include <net/netdev_rx_queue.h> 14 15 #include <net/page_pool/helpers.h> 15 16 #include <net/xdp.h> 16 17 ··· 25 24 26 25 #include <trace/events/page_pool.h> 27 26 27 + #include "mp_dmabuf_devmem.h" 28 + #include "netmem_priv.h" 28 29 #include "page_pool_priv.h" 30 + 31 + DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); 29 32 30 33 #define DEFER_TIME (msecs_to_jiffies(1000)) 31 34 #define DEFER_WARN_INTERVAL (60 * HZ) ··· 192 187 int cpuid) 193 188 { 194 189 unsigned int ring_qsize = 1024; /* Default */ 190 + struct netdev_rx_queue *rxq; 191 + int err; 195 192 196 193 page_pool_struct_check(); 197 194 ··· 275 268 if (pool->dma_map) 276 269 get_device(pool->p.dev); 277 270 271 + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { 272 + /* We rely on rtnl_lock()ing to make sure netdev_rx_queue 273 + * configuration doesn't change while we're initializing 274 + * the page_pool. 275 + */ 276 + ASSERT_RTNL(); 277 + rxq = __netif_get_rx_queue(pool->slow.netdev, 278 + pool->slow.queue_idx); 279 + pool->mp_priv = rxq->mp_params.mp_priv; 280 + } 281 + 282 + if (pool->mp_priv) { 283 + err = mp_dmabuf_devmem_init(pool); 284 + if (err) { 285 + pr_warn("%s() mem-provider init failed %d\n", __func__, 286 + err); 287 + goto free_ptr_ring; 288 + } 289 + 290 + static_branch_inc(&page_pool_mem_providers); 291 + } 292 + 278 293 return 0; 294 + 295 + free_ptr_ring: 296 + ptr_ring_cleanup(&pool->ring, NULL); 297 + #ifdef CONFIG_PAGE_POOL_STATS 298 + if (!pool->system) 299 + free_percpu(pool->recycle_stats); 300 + #endif 301 + return err; 279 302 } 280 303 281 304 static void page_pool_uninit(struct page_pool *pool) ··· 395 358 if (unlikely(!netmem)) 396 359 break; 397 360 398 - if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) { 361 + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { 399 362 pool->alloc.cache[pool->alloc.count++] = netmem; 400 363 } else { 401 364 /* NUMA mismatch; ··· 487 
450 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 488 451 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); 489 452 return false; 490 - } 491 - 492 - static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) 493 - { 494 - struct page *page = netmem_to_page(netmem); 495 - 496 - page->pp = pool; 497 - page->pp_magic |= PP_SIGNATURE; 498 - 499 - /* Ensuring all pages have been split into one fragment initially: 500 - * page_pool_set_pp_info() is only called once for every page when it 501 - * is allocated from the page allocator and page_pool_fragment_page() 502 - * is dirtying the same cache line as the page->pp_magic above, so 503 - * the overhead is negligible. 504 - */ 505 - page_pool_fragment_netmem(netmem, 1); 506 - if (pool->has_init_callback) 507 - pool->slow.init_callback(netmem, pool->slow.init_arg); 508 - } 509 - 510 - static void page_pool_clear_pp_info(netmem_ref netmem) 511 - { 512 - struct page *page = netmem_to_page(netmem); 513 - 514 - page->pp_magic = 0; 515 - page->pp = NULL; 516 453 } 517 454 518 455 static struct page *__page_pool_alloc_page_order(struct page_pool *pool, ··· 584 573 return netmem; 585 574 586 575 /* Slow-path: cache empty, do real allocation */ 587 - netmem = __page_pool_alloc_pages_slow(pool, gfp); 576 + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) 577 + netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); 578 + else 579 + netmem = __page_pool_alloc_pages_slow(pool, gfp); 588 580 return netmem; 589 581 } 590 582 EXPORT_SYMBOL(page_pool_alloc_netmem); ··· 623 609 return inflight; 624 610 } 625 611 612 + void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) 613 + { 614 + netmem_set_pp(netmem, pool); 615 + netmem_or_pp_magic(netmem, PP_SIGNATURE); 616 + 617 + /* Ensuring all pages have been split into one fragment initially: 618 + * page_pool_set_pp_info() is only called once for every page when it 619 + * is allocated from the page allocator and page_pool_fragment_page() 
620 + * is dirtying the same cache line as the page->pp_magic above, so 621 + * the overhead is negligible. 622 + */ 623 + page_pool_fragment_netmem(netmem, 1); 624 + if (pool->has_init_callback) 625 + pool->slow.init_callback(netmem, pool->slow.init_arg); 626 + } 627 + 628 + void page_pool_clear_pp_info(netmem_ref netmem) 629 + { 630 + netmem_clear_pp_magic(netmem); 631 + netmem_set_pp(netmem, NULL); 632 + } 633 + 626 634 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, 627 635 netmem_ref netmem) 628 636 { ··· 673 637 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) 674 638 { 675 639 int count; 640 + bool put; 676 641 677 - __page_pool_release_page_dma(pool, netmem); 642 + put = true; 643 + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) 644 + put = mp_dmabuf_devmem_release_page(pool, netmem); 645 + else 646 + __page_pool_release_page_dma(pool, netmem); 678 647 679 648 /* This may be the last page returned, releasing the pool, so 680 649 * it is not safe to reference pool afterwards. ··· 687 646 count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); 688 647 trace_page_pool_state_release(pool, netmem, count); 689 648 690 - page_pool_clear_pp_info(netmem); 691 - put_page(netmem_to_page(netmem)); 649 + if (put) { 650 + page_pool_clear_pp_info(netmem); 651 + put_page(netmem_to_page(netmem)); 652 + } 692 653 /* An optimization would be to call __free_pages(page, pool->p.order) 693 654 * knowing page is not part of page-cache (thus avoiding a 694 655 * __page_cache_release() call). 
··· 735 692 736 693 static bool __page_pool_page_can_be_recycled(netmem_ref netmem) 737 694 { 738 - return page_ref_count(netmem_to_page(netmem)) == 1 && 739 - !page_is_pfmemalloc(netmem_to_page(netmem)); 695 + return netmem_is_net_iov(netmem) || 696 + (page_ref_count(netmem_to_page(netmem)) == 1 && 697 + !page_is_pfmemalloc(netmem_to_page(netmem))); 740 698 } 741 699 742 700 /* If the page refcnt == 1, this will try to recycle the page. ··· 772 728 /* Page found as candidate for recycling */ 773 729 return netmem; 774 730 } 731 + 775 732 /* Fallback/non-XDP mode: API user have elevated refcnt. 776 733 * 777 734 * Many drivers split up the page into fragments, and some ··· 994 949 /* Empty recycle ring */ 995 950 while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { 996 951 /* Verify the refcnt invariant of cached pages */ 997 - if (!(page_ref_count(netmem_to_page(netmem)) == 1)) 952 + if (!(netmem_ref_count(netmem) == 1)) 998 953 pr_crit("%s() page_pool refcnt %d violation\n", 999 954 __func__, netmem_ref_count(netmem)); 1000 955 ··· 1009 964 1010 965 page_pool_unlist(pool); 1011 966 page_pool_uninit(pool); 967 + 968 + if (pool->mp_priv) { 969 + mp_dmabuf_devmem_destroy(pool); 970 + static_branch_dec(&page_pool_mem_providers); 971 + } 972 + 1012 973 kfree(pool); 1013 974 } 1014 975
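The page_pool_mem_providers static key keeps the no-provider fast path free of extra work: it is only incremented when a pool actually attaches a memory provider, and the alloc/release paths check it before dispatching to the provider instead of the normal page allocator. A rough model of that gating, with a plain counter standing in for the static key and the provider calls:

```c
#include <assert.h>

/* Plain counter standing in for the page_pool_mem_providers static key:
 * nonzero only while at least one pool has a provider attached. */
static int mem_providers;

struct fake_pool { int has_provider; int provider_allocs; int page_allocs; };

static void pool_init(struct fake_pool *p, int has_provider)
{
	p->has_provider = has_provider;
	p->provider_allocs = 0;
	p->page_allocs = 0;
	if (has_provider)
		mem_providers++;	/* static_branch_inc() */
}

static void pool_alloc(struct fake_pool *p)
{
	if (mem_providers && p->has_provider)
		p->provider_allocs++;	/* mp_dmabuf_devmem_alloc_netmems() */
	else
		p->page_allocs++;	/* __page_pool_alloc_pages_slow() */
}
```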
net/core/page_pool_priv.h (+46)
··· 3 3 #ifndef __PAGE_POOL_PRIV_H 4 4 #define __PAGE_POOL_PRIV_H 5 5 6 + #include <net/page_pool/helpers.h> 7 + 8 + #include "netmem_priv.h" 9 + 6 10 s32 page_pool_inflight(const struct page_pool *pool, bool strict); 7 11 8 12 int page_pool_list(struct page_pool *pool); 9 13 void page_pool_detached(struct page_pool *pool); 10 14 void page_pool_unlist(struct page_pool *pool); 15 + 16 + static inline bool 17 + page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) 18 + { 19 + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { 20 + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); 21 + 22 + /* We assume page alignment to shave off bottom bits, 23 + * if this "compression" doesn't work we need to drop. 24 + */ 25 + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) 26 + << PAGE_SHIFT; 27 + } 28 + 29 + netmem_set_dma_addr(netmem, addr); 30 + return false; 31 + } 32 + 33 + static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 34 + { 35 + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); 36 + } 37 + 38 + #if defined(CONFIG_PAGE_POOL) 39 + void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); 40 + void page_pool_clear_pp_info(netmem_ref netmem); 41 + int page_pool_check_memory_provider(struct net_device *dev, 42 + struct netdev_rx_queue *rxq); 43 + #else 44 + static inline void page_pool_set_pp_info(struct page_pool *pool, 45 + netmem_ref netmem) 46 + { 47 + } 48 + static inline void page_pool_clear_pp_info(netmem_ref netmem) 49 + { 50 + } 51 + static inline int page_pool_check_memory_provider(struct net_device *dev, 52 + struct netdev_rx_queue *rxq) 53 + { 54 + return 0; 55 + } 56 + #endif 11 57 12 58 #endif
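page_pool_set_dma_addr_netmem() handles 32-bit architectures with 64-bit DMA by storing addr >> PAGE_SHIFT and detecting unrepresentable addresses with a round-trip comparison: an unaligned or oversized address fails the check and the caller drops the mapping. A standalone model of that compression (PAGE_SHIFT fixed at 12 and a 32-bit field for illustration):

```c
#include <assert.h>
#include <stdint.h>

#define FAKE_PAGE_SHIFT 12

static uint32_t stored;	/* the 32-bit dma_addr field */

/* Returns nonzero if @addr cannot be represented after the
 * ">> PAGE_SHIFT" compression (caller must drop the mapping),
 * mirroring the kernel helper's return value. */
static int set_dma_addr(uint64_t addr)
{
	stored = (uint32_t)(addr >> FAKE_PAGE_SHIFT);
	/* Round-trip check: only page-aligned addresses that fit in
	 * 32 + PAGE_SHIFT bits survive the compression. */
	return addr != ((uint64_t)stored << FAKE_PAGE_SHIFT);
}
```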
net/core/page_pool_user.c (+31 -1)
··· 4 4 #include <linux/netdevice.h> 5 5 #include <linux/xarray.h> 6 6 #include <net/net_debug.h> 7 - #include <net/page_pool/types.h> 7 + #include <net/netdev_rx_queue.h> 8 8 #include <net/page_pool/helpers.h> 9 + #include <net/page_pool/types.h> 9 10 #include <net/sock.h> 10 11 12 + #include "devmem.h" 11 13 #include "page_pool_priv.h" 12 14 #include "netdev-genl-gen.h" 13 15 ··· 214 212 page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, 215 213 const struct genl_info *info) 216 214 { 215 + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 217 216 size_t inflight, refsz; 218 217 void *hdr; 219 218 ··· 242 239 if (pool->user.detach_time && 243 240 nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, 244 241 pool->user.detach_time)) 242 + goto err_cancel; 243 + 244 + if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) 245 245 goto err_cancel; 246 246 247 247 genlmsg_end(rsp, hdr); ··· 348 342 if (!hlist_unhashed(&pool->user.list)) 349 343 hlist_del(&pool->user.list); 350 344 mutex_unlock(&page_pools_lock); 345 + } 346 + 347 + int page_pool_check_memory_provider(struct net_device *dev, 348 + struct netdev_rx_queue *rxq) 349 + { 350 + struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; 351 + struct page_pool *pool; 352 + struct hlist_node *n; 353 + 354 + if (!binding) 355 + return 0; 356 + 357 + mutex_lock(&page_pools_lock); 358 + hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { 359 + if (pool->mp_priv != binding) 360 + continue; 361 + 362 + if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { 363 + mutex_unlock(&page_pools_lock); 364 + return 0; 365 + } 366 + } 367 + mutex_unlock(&page_pools_lock); 368 + return -ENODATA; 351 369 } 352 370 353 371 static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
net/core/skbuff.c (+63 -14)
··· 88 88 #include <linux/textsearch.h> 89 89 90 90 #include "dev.h" 91 + #include "netmem_priv.h" 91 92 #include "sock_destructor.h" 92 93 93 94 #ifdef CONFIG_SKB_EXTENSIONS ··· 921 920 skb_get(list); 922 921 } 923 922 924 - static bool is_pp_page(struct page *page) 923 + static bool is_pp_netmem(netmem_ref netmem) 925 924 { 926 - return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; 925 + return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; 927 926 } 928 927 929 928 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, ··· 1021 1020 #if IS_ENABLED(CONFIG_PAGE_POOL) 1022 1021 bool napi_pp_put_page(netmem_ref netmem) 1023 1022 { 1024 - struct page *page = netmem_to_page(netmem); 1025 - 1026 - page = compound_head(page); 1023 + netmem = netmem_compound_head(netmem); 1027 1024 1028 1025 /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation 1029 1026 * in order to preserve any existing bits, such as bit 0 for the ··· 1030 1031 * and page_is_pfmemalloc() is checked in __page_pool_put_page() 1031 1032 * to avoid recycling the pfmemalloc page. 
1032 1033 */ 1033 - if (unlikely(!is_pp_page(page))) 1034 + if (unlikely(!is_pp_netmem(netmem))) 1034 1035 return false; 1035 1036 1036 - page_pool_put_full_netmem(page->pp, page_to_netmem(page), false); 1037 + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); 1037 1038 1038 1039 return true; 1039 1040 } ··· 1060 1061 static int skb_pp_frag_ref(struct sk_buff *skb) 1061 1062 { 1062 1063 struct skb_shared_info *shinfo; 1063 - struct page *head_page; 1064 + netmem_ref head_netmem; 1064 1065 int i; 1065 1066 1066 1067 if (!skb->pp_recycle) ··· 1069 1070 shinfo = skb_shinfo(skb); 1070 1071 1071 1072 for (i = 0; i < shinfo->nr_frags; i++) { 1072 - head_page = compound_head(skb_frag_page(&shinfo->frags[i])); 1073 - if (likely(is_pp_page(head_page))) 1074 - page_pool_ref_page(head_page); 1073 + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); 1074 + if (likely(is_pp_netmem(head_netmem))) 1075 + page_pool_ref_netmem(head_netmem); 1075 1076 else 1076 - page_ref_inc(head_page); 1077 + page_ref_inc(netmem_to_page(head_netmem)); 1077 1078 } 1078 1079 return 0; 1079 1080 } ··· 1370 1371 u32 p_off, p_len, copied; 1371 1372 struct page *p; 1372 1373 u8 *vaddr; 1374 + 1375 + if (skb_frag_is_net_iov(frag)) { 1376 + printk("%sskb frag %d: not readable\n", level, i); 1377 + len -= skb_frag_size(frag); 1378 + if (!len) 1379 + break; 1380 + continue; 1381 + } 1373 1382 1374 1383 skb_frag_foreach_page(frag, skb_frag_off(frag), 1375 1384 skb_frag_size(frag), p, p_off, p_len, ··· 1972 1965 if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) 1973 1966 return -EINVAL; 1974 1967 1968 + if (!skb_frags_readable(skb)) 1969 + return -EFAULT; 1970 + 1975 1971 if (!num_frags) 1976 1972 goto release; 1977 1973 ··· 2147 2137 struct sk_buff *n; 2148 2138 unsigned int size; 2149 2139 int headerlen; 2140 + 2141 + if (!skb_frags_readable(skb)) 2142 + return NULL; 2150 2143 2151 2144 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2152 2145 return NULL; ··· 2488 
2475 int head_copy_len, head_copy_off; 2489 2476 struct sk_buff *n; 2490 2477 int oldheadroom; 2478 + 2479 + if (!skb_frags_readable(skb)) 2480 + return NULL; 2491 2481 2492 2482 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2493 2483 return NULL; ··· 2836 2820 */ 2837 2821 int i, k, eat = (skb->tail + delta) - skb->end; 2838 2822 2823 + if (!skb_frags_readable(skb)) 2824 + return NULL; 2825 + 2839 2826 if (eat > 0 || skb_cloned(skb)) { 2840 2827 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2841 2828 GFP_ATOMIC)) ··· 2991 2972 offset += copy; 2992 2973 to += copy; 2993 2974 } 2975 + 2976 + if (!skb_frags_readable(skb)) 2977 + goto fault; 2994 2978 2995 2979 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2996 2980 int end; ··· 3183 3161 /* 3184 3162 * then map the fragments 3185 3163 */ 3164 + if (!skb_frags_readable(skb)) 3165 + return false; 3166 + 3186 3167 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 3187 3168 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 3169 + 3170 + if (WARN_ON_ONCE(!skb_frag_page(f))) 3171 + return false; 3188 3172 3189 3173 if (__splice_segment(skb_frag_page(f), 3190 3174 skb_frag_off(f), skb_frag_size(f), ··· 3409 3381 from += copy; 3410 3382 } 3411 3383 3384 + if (!skb_frags_readable(skb)) 3385 + goto fault; 3386 + 3412 3387 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3413 3388 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3414 3389 int end; ··· 3490 3459 offset += copy; 3491 3460 pos = copy; 3492 3461 } 3462 + 3463 + if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3464 + return 0; 3493 3465 3494 3466 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3495 3467 int end; ··· 3593 3559 to += copy; 3594 3560 pos = copy; 3595 3561 } 3562 + 3563 + if (!skb_frags_readable(skb)) 3564 + return 0; 3596 3565 3597 3566 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3598 3567 int end; ··· 4088 4051 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 4089 4052 4090 4053 skb_shinfo(skb1)->nr_frags 
= skb_shinfo(skb)->nr_frags; 4054 + skb1->unreadable = skb->unreadable; 4091 4055 skb_shinfo(skb)->nr_frags = 0; 4092 4056 skb1->data_len = skb->data_len; 4093 4057 skb1->len += skb1->data_len; ··· 4136 4098 pos += size; 4137 4099 } 4138 4100 skb_shinfo(skb1)->nr_frags = k; 4101 + 4102 + skb1->unreadable = skb->unreadable; 4139 4103 } 4140 4104 4141 4105 /** ··· 4374 4334 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 4375 4335 return block_limit - abs_offset; 4376 4336 } 4337 + 4338 + if (!skb_frags_readable(st->cur_skb)) 4339 + return 0; 4377 4340 4378 4341 if (st->frag_idx == 0 && !st->frag_data) 4379 4342 st->stepped_offset += skb_headlen(st->cur_skb); ··· 6025 5982 if (to->pp_recycle != from->pp_recycle) 6026 5983 return false; 6027 5984 6028 - if (len <= skb_tailroom(to)) { 5985 + if (skb_frags_readable(from) != skb_frags_readable(to)) 5986 + return false; 5987 + 5988 + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { 6029 5989 if (len) 6030 5990 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 6031 5991 *delta_truesize = 0; ··· 6204 6158 { 6205 6159 if (!pskb_may_pull(skb, write_len)) 6206 6160 return -ENOMEM; 6161 + 6162 + if (!skb_frags_readable(skb)) 6163 + return -EFAULT; 6207 6164 6208 6165 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 6209 6166 return 0; ··· 6887 6838 { 6888 6839 if (skb->data_len) { 6889 6840 if (skb->data_len > skb->end - skb->tail || 6890 - skb_cloned(skb)) 6841 + skb_cloned(skb) || !skb_frags_readable(skb)) 6891 6842 return; 6892 6843 6893 6844 /* Nice, we can free page frag(s) right now */
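The common thread in the skbuff.c hunks is that every path which would map frag payload into the CPU now checks skb_frags_readable() first and bails out (typically with -EFAULT) when the frags are device-memory net_iovs; only the linear header area is always host-addressable. A toy model of that guard (the struct is a simplified stand-in for sk_buff, not the real layout):

```c
#include <assert.h>
#include <string.h>

struct fake_skb {
	const char *linear;	/* header bytes, always host memory */
	int linear_len;
	int frags_readable;	/* 0 when frags are net_iovs */
	const char *frag;	/* frag payload, only valid if readable */
	int frag_len;
};

/* Copy up to @len bytes out of the skb, refusing to touch frag
 * payload that lives in device memory. */
static int skb_copy_bits_model(const struct fake_skb *skb, char *to, int len)
{
	int copied = 0, n;

	n = len < skb->linear_len ? len : skb->linear_len;
	memcpy(to, skb->linear, n);	/* headers always copy fine */
	copied += n;
	if (copied == len)
		return copied;

	if (!skb->frags_readable)
		return -14;	/* -EFAULT: cannot touch device memory */

	n = len - copied;
	if (n > skb->frag_len)
		n = skb->frag_len;
	memcpy(to + copied, skb->frag, n);
	return copied + n;
}
```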
net/core/sock.c (+68)
··· 124 124 #include <linux/netdevice.h> 125 125 #include <net/protocol.h> 126 126 #include <linux/skbuff.h> 127 + #include <linux/skbuff_ref.h> 127 128 #include <net/net_namespace.h> 128 129 #include <net/request_sock.h> 129 130 #include <net/sock.h> ··· 1050 1049 return 0; 1051 1050 } 1052 1051 1052 + #ifdef CONFIG_PAGE_POOL 1053 + 1054 + /* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in 1055 + * 1 syscall. The limit exists to limit the amount of memory the kernel 1056 + * allocates to copy these tokens. 1057 + */ 1058 + #define MAX_DONTNEED_TOKENS 128 1059 + 1060 + static noinline_for_stack int 1061 + sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1062 + { 1063 + unsigned int num_tokens, i, j, k, netmem_num = 0; 1064 + struct dmabuf_token *tokens; 1065 + netmem_ref netmems[16]; 1066 + int ret = 0; 1067 + 1068 + if (!sk_is_tcp(sk)) 1069 + return -EBADF; 1070 + 1071 + if (optlen % sizeof(struct dmabuf_token) || 1072 + optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1073 + return -EINVAL; 1074 + 1075 + tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); 1076 + if (!tokens) 1077 + return -ENOMEM; 1078 + 1079 + num_tokens = optlen / sizeof(struct dmabuf_token); 1080 + if (copy_from_sockptr(tokens, optval, optlen)) { 1081 + kvfree(tokens); 1082 + return -EFAULT; 1083 + } 1084 + 1085 + xa_lock_bh(&sk->sk_user_frags); 1086 + for (i = 0; i < num_tokens; i++) { 1087 + for (j = 0; j < tokens[i].token_count; j++) { 1088 + netmem_ref netmem = (__force netmem_ref)__xa_erase( 1089 + &sk->sk_user_frags, tokens[i].token_start + j); 1090 + 1091 + if (netmem && 1092 + !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { 1093 + netmems[netmem_num++] = netmem; 1094 + if (netmem_num == ARRAY_SIZE(netmems)) { 1095 + xa_unlock_bh(&sk->sk_user_frags); 1096 + for (k = 0; k < netmem_num; k++) 1097 + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1098 + netmem_num = 0; 1099 + xa_lock_bh(&sk->sk_user_frags); 1100 + } 1101 + ret++; 1102 + } 
1103 + } 1104 + } 1105 + 1106 + xa_unlock_bh(&sk->sk_user_frags); 1107 + for (k = 0; k < netmem_num; k++) 1108 + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1109 + 1110 + kvfree(tokens); 1111 + return ret; 1112 + } 1113 + #endif 1114 + 1053 1115 void sockopt_lock_sock(struct sock *sk) 1054 1116 { 1055 1117 /* When current->bpf_ctx is set, the setsockopt is called from ··· 1275 1211 ret = -EOPNOTSUPP; 1276 1212 return ret; 1277 1213 } 1214 + #ifdef CONFIG_PAGE_POOL 1215 + case SO_DEVMEM_DONTNEED: 1216 + return sock_devmem_dontneed(sk, optval, optlen); 1217 + #endif 1278 1218 } 1279 1219 1280 1220 sockopt_lock_sock(sk);
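sock_devmem_dontneed() erases tokens under xa_lock_bh() but releases the underlying netmems in batches of 16, dropping the lock around each napi_pp_put_page() burst so it is never held across the whole token list. The batching logic in isolation (counters replace the lock and the page-pool calls; names are illustrative):

```c
#include <assert.h>

#define BATCH 16	/* mirrors ARRAY_SIZE(netmems) in the patch */

static int flushes;	/* times the "lock" was dropped to flush a batch */

/* Release @ntokens frags, flushing at most BATCH at a time. */
static int dontneed(int ntokens)
{
	int i, pending = 0, released = 0;

	flushes = 0;
	for (i = 0; i < ntokens; i++) {
		pending++;		/* token erased from the xarray */
		if (pending == BATCH) {
			flushes++;	/* unlock, put BATCH pages, relock */
			released += pending;
			pending = 0;
		}
	}
	released += pending;		/* final flush after the loop */
	if (pending)
		flushes++;
	return released;
}
```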
net/ethtool/common.c (+8)
··· 655 655 { 656 656 u64 max_rxnfc_in_use; 657 657 u32 max_rxfh_in_use; 658 + int max_mp_in_use; 658 659 659 660 /* ensure the new Rx count fits within the configured Rx flow 660 661 * indirection table/rxnfc settings ··· 671 670 if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { 672 671 if (info) 673 672 GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); 673 + return -EINVAL; 674 + } 675 + 676 + max_mp_in_use = dev_get_min_mp_channel_count(dev); 677 + if (channels.combined_count + channels.rx_count <= max_mp_in_use) { 678 + if (info) 679 + GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing memory provider setting (%d)", max_mp_in_use); 674 680 return -EINVAL; 675 681 } 676 682
net/ipv4/esp4.c (+2 -1)
··· 115 115 */ 116 116 if (req->src != req->dst) 117 117 for (sg = sg_next(req->src); sg; sg = sg_next(sg)) 118 - skb_page_unref(sg_page(sg), skb->pp_recycle); 118 + skb_page_unref(page_to_netmem(sg_page(sg)), 119 + skb->pp_recycle); 119 120 } 120 121 121 122 #ifdef CONFIG_INET_ESPINTCP
net/ipv4/tcp.c (+258 -5)
··· 285 285 #include <trace/events/tcp.h> 286 286 #include <net/rps.h> 287 287 288 + #include "../core/devmem.h" 289 + 288 290 /* Track pending CMSGs. */ 289 291 enum { 290 292 TCP_CMSG_INQ = 1, ··· 473 471 474 472 set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); 475 473 sk_sockets_allocated_inc(sk); 474 + xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); 476 475 } 477 476 EXPORT_SYMBOL(tcp_init_sock); 478 477 ··· 2163 2160 skb = tcp_recv_skb(sk, seq, &offset); 2164 2161 } 2165 2162 2163 + if (!skb_frags_readable(skb)) 2164 + break; 2165 + 2166 2166 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2167 2167 tcp_update_recv_tstamps(skb, tss); 2168 2168 zc->msg_flags |= TCP_CMSG_TS; ··· 2183 2177 break; 2184 2178 } 2185 2179 page = skb_frag_page(frags); 2180 + if (WARN_ON_ONCE(!page)) 2181 + break; 2182 + 2186 2183 prefetchw(page); 2187 2184 pages[pages_to_map++] = page; 2188 2185 length += PAGE_SIZE; ··· 2331 2322 return inq; 2332 2323 } 2333 2324 2325 + /* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ 2326 + struct tcp_xa_pool { 2327 + u8 max; /* max <= MAX_SKB_FRAGS */ 2328 + u8 idx; /* idx <= max */ 2329 + __u32 tokens[MAX_SKB_FRAGS]; 2330 + netmem_ref netmems[MAX_SKB_FRAGS]; 2331 + }; 2332 + 2333 + static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) 2334 + { 2335 + int i; 2336 + 2337 + /* Commit part that has been copied to user space. */ 2338 + for (i = 0; i < p->idx; i++) 2339 + __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, 2340 + (__force void *)p->netmems[i], GFP_KERNEL); 2341 + /* Rollback what has been pre-allocated and is no longer needed. 
*/ 2342 + for (; i < p->max; i++) 2343 + __xa_erase(&sk->sk_user_frags, p->tokens[i]); 2344 + 2345 + p->max = 0; 2346 + p->idx = 0; 2347 + } 2348 + 2349 + static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) 2350 + { 2351 + if (!p->max) 2352 + return; 2353 + 2354 + xa_lock_bh(&sk->sk_user_frags); 2355 + 2356 + tcp_xa_pool_commit_locked(sk, p); 2357 + 2358 + xa_unlock_bh(&sk->sk_user_frags); 2359 + } 2360 + 2361 + static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, 2362 + unsigned int max_frags) 2363 + { 2364 + int err, k; 2365 + 2366 + if (p->idx < p->max) 2367 + return 0; 2368 + 2369 + xa_lock_bh(&sk->sk_user_frags); 2370 + 2371 + tcp_xa_pool_commit_locked(sk, p); 2372 + 2373 + for (k = 0; k < max_frags; k++) { 2374 + err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], 2375 + XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); 2376 + if (err) 2377 + break; 2378 + } 2379 + 2380 + xa_unlock_bh(&sk->sk_user_frags); 2381 + 2382 + p->max = k; 2383 + p->idx = 0; 2384 + return k ? 0 : err; 2385 + } 2386 + 2387 + /* On error, returns the -errno. On success, returns number of bytes sent to the 2388 + * user. May not consume all of @remaining_len. 2389 + */ 2390 + static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, 2391 + unsigned int offset, struct msghdr *msg, 2392 + int remaining_len) 2393 + { 2394 + struct dmabuf_cmsg dmabuf_cmsg = { 0 }; 2395 + struct tcp_xa_pool tcp_xa_pool; 2396 + unsigned int start; 2397 + int i, copy, n; 2398 + int sent = 0; 2399 + int err = 0; 2400 + 2401 + tcp_xa_pool.max = 0; 2402 + tcp_xa_pool.idx = 0; 2403 + do { 2404 + start = skb_headlen(skb); 2405 + 2406 + if (skb_frags_readable(skb)) { 2407 + err = -ENODEV; 2408 + goto out; 2409 + } 2410 + 2411 + /* Copy header. 
*/ 2412 + copy = start - offset; 2413 + if (copy > 0) { 2414 + copy = min(copy, remaining_len); 2415 + 2416 + n = copy_to_iter(skb->data + offset, copy, 2417 + &msg->msg_iter); 2418 + if (n != copy) { 2419 + err = -EFAULT; 2420 + goto out; 2421 + } 2422 + 2423 + offset += copy; 2424 + remaining_len -= copy; 2425 + 2426 + /* First a dmabuf_cmsg for # bytes copied to user 2427 + * buffer. 2428 + */ 2429 + memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); 2430 + dmabuf_cmsg.frag_size = copy; 2431 + err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, 2432 + sizeof(dmabuf_cmsg), &dmabuf_cmsg); 2433 + if (err || msg->msg_flags & MSG_CTRUNC) { 2434 + msg->msg_flags &= ~MSG_CTRUNC; 2435 + if (!err) 2436 + err = -ETOOSMALL; 2437 + goto out; 2438 + } 2439 + 2440 + sent += copy; 2441 + 2442 + if (remaining_len == 0) 2443 + goto out; 2444 + } 2445 + 2446 + /* after that, send information of dmabuf pages through a 2447 + * sequence of cmsg 2448 + */ 2449 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2450 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2451 + struct net_iov *niov; 2452 + u64 frag_offset; 2453 + int end; 2454 + 2455 + /* !skb_frags_readable() should indicate that ALL the 2456 + * frags in this skb are dmabuf net_iovs. We're checking 2457 + * for that flag above, but also check individual frags 2458 + * here. If the tcp stack is not setting 2459 + * skb_frags_readable() correctly, we still don't want 2460 + * to crash here. 
2461 + */ 2462 + if (!skb_frag_net_iov(frag)) { 2463 + net_err_ratelimited("Found non-dmabuf skb with net_iov"); 2464 + err = -ENODEV; 2465 + goto out; 2466 + } 2467 + 2468 + niov = skb_frag_net_iov(frag); 2469 + end = start + skb_frag_size(frag); 2470 + copy = end - offset; 2471 + 2472 + if (copy > 0) { 2473 + copy = min(copy, remaining_len); 2474 + 2475 + frag_offset = net_iov_virtual_addr(niov) + 2476 + skb_frag_off(frag) + offset - 2477 + start; 2478 + dmabuf_cmsg.frag_offset = frag_offset; 2479 + dmabuf_cmsg.frag_size = copy; 2480 + err = tcp_xa_pool_refill(sk, &tcp_xa_pool, 2481 + skb_shinfo(skb)->nr_frags - i); 2482 + if (err) 2483 + goto out; 2484 + 2485 + /* Will perform the exchange later */ 2486 + dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; 2487 + dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); 2488 + 2489 + offset += copy; 2490 + remaining_len -= copy; 2491 + 2492 + err = put_cmsg(msg, SOL_SOCKET, 2493 + SO_DEVMEM_DMABUF, 2494 + sizeof(dmabuf_cmsg), 2495 + &dmabuf_cmsg); 2496 + if (err || msg->msg_flags & MSG_CTRUNC) { 2497 + msg->msg_flags &= ~MSG_CTRUNC; 2498 + if (!err) 2499 + err = -ETOOSMALL; 2500 + goto out; 2501 + } 2502 + 2503 + atomic_long_inc(&niov->pp_ref_count); 2504 + tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); 2505 + 2506 + sent += copy; 2507 + 2508 + if (remaining_len == 0) 2509 + goto out; 2510 + } 2511 + start = end; 2512 + } 2513 + 2514 + tcp_xa_pool_commit(sk, &tcp_xa_pool); 2515 + if (!remaining_len) 2516 + goto out; 2517 + 2518 + /* if remaining_len is not satisfied yet, we need to go to the 2519 + * next frag in the frag_list to satisfy remaining_len. 
2520 + */ 2521 + skb = skb_shinfo(skb)->frag_list ?: skb->next; 2522 + 2523 + offset = offset - start; 2524 + } while (skb); 2525 + 2526 + if (remaining_len) { 2527 + err = -EFAULT; 2528 + goto out; 2529 + } 2530 + 2531 + out: 2532 + tcp_xa_pool_commit(sk, &tcp_xa_pool); 2533 + if (!sent) 2534 + sent = err; 2535 + 2536 + return sent; 2537 + } 2538 + 2334 2539 /* 2335 2540 * This routine copies from a sock struct into the user buffer. 2336 2541 * ··· 2558 2335 int *cmsg_flags) 2559 2336 { 2560 2337 struct tcp_sock *tp = tcp_sk(sk); 2338 + int last_copied_dmabuf = -1; /* uninitialized */ 2561 2339 int copied = 0; 2562 2340 u32 peek_seq; 2563 2341 u32 *seq; ··· 2738 2514 } 2739 2515 2740 2516 if (!(flags & MSG_TRUNC)) { 2741 - err = skb_copy_datagram_msg(skb, offset, msg, used); 2742 - if (err) { 2743 - /* Exception. Bailout! */ 2744 - if (!copied) 2745 - copied = -EFAULT; 2517 + if (last_copied_dmabuf != -1 && 2518 + last_copied_dmabuf != !skb_frags_readable(skb)) 2746 2519 break; 2520 + 2521 + if (skb_frags_readable(skb)) { 2522 + err = skb_copy_datagram_msg(skb, offset, msg, 2523 + used); 2524 + if (err) { 2525 + /* Exception. Bailout! */ 2526 + if (!copied) 2527 + copied = -EFAULT; 2528 + break; 2529 + } 2530 + } else { 2531 + if (!(flags & MSG_SOCK_DEVMEM)) { 2532 + /* dmabuf skbs can only be received 2533 + * with the MSG_SOCK_DEVMEM flag. 2534 + */ 2535 + if (!copied) 2536 + copied = -EFAULT; 2537 + 2538 + break; 2539 + } 2540 + 2541 + err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, 2542 + used); 2543 + if (err <= 0) { 2544 + if (!copied) 2545 + copied = -EFAULT; 2546 + 2547 + break; 2548 + } 2549 + used = err; 2747 2550 } 2748 2551 } 2552 + 2553 + last_copied_dmabuf = !skb_frags_readable(skb); 2749 2554 2750 2555 WRITE_ONCE(*seq, *seq + used); 2751 2556 copied += used;
+10 -3
net/ipv4/tcp_input.c
··· 5391 5391 for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { 5392 5392 n = tcp_skb_next(skb, list); 5393 5393 5394 + if (!skb_frags_readable(skb)) 5395 + goto skip_this; 5396 + 5394 5397 /* No new bits? It is possible on ofo queue. */ 5395 5398 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 5396 5399 skb = tcp_collapse_one(sk, skb, list, root); ··· 5414 5411 break; 5415 5412 } 5416 5413 5417 - if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && 5414 + if (n && n != tail && skb_frags_readable(n) && 5415 + tcp_skb_can_collapse_rx(skb, n) && 5418 5416 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { 5419 5417 end_of_skbs = false; 5420 5418 break; 5421 5419 } 5422 5420 5421 + skip_this: 5423 5422 /* Decided to skip this, advance start seq. */ 5424 5423 start = TCP_SKB_CB(skb)->end_seq; 5425 5424 } 5426 5425 if (end_of_skbs || 5427 - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 5426 + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || 5427 + !skb_frags_readable(skb)) 5428 5428 return; 5429 5429 5430 5430 __skb_queue_head_init(&tmp); ··· 5469 5463 if (!skb || 5470 5464 skb == tail || 5471 5465 !tcp_skb_can_collapse_rx(nskb, skb) || 5472 - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 5466 + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || 5467 + !skb_frags_readable(skb)) 5473 5468 goto end; 5474 5469 } 5475 5470 }
+16
net/ipv4/tcp_ipv4.c
··· 79 79 #include <linux/seq_file.h> 80 80 #include <linux/inetdevice.h> 81 81 #include <linux/btf_ids.h> 82 + #include <linux/skbuff_ref.h> 82 83 83 84 #include <crypto/hash.h> 84 85 #include <linux/scatterlist.h> ··· 2513 2512 } 2514 2513 #endif 2515 2514 2515 + static void tcp_release_user_frags(struct sock *sk) 2516 + { 2517 + #ifdef CONFIG_PAGE_POOL 2518 + unsigned long index; 2519 + void *netmem; 2520 + 2521 + xa_for_each(&sk->sk_user_frags, index, netmem) 2522 + WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2523 + #endif 2524 + } 2525 + 2516 2526 void tcp_v4_destroy_sock(struct sock *sk) 2517 2527 { 2518 2528 struct tcp_sock *tp = tcp_sk(sk); 2529 + 2530 + tcp_release_user_frags(sk); 2531 + 2532 + xa_destroy(&sk->sk_user_frags); 2519 2533 2520 2534 trace_tcp_destroy_sock(sk); 2521 2535
+2
net/ipv4/tcp_minisocks.c
··· 628 628 629 629 __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); 630 630 631 + xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); 632 + 631 633 return newsk; 632 634 } 633 635 EXPORT_SYMBOL(tcp_create_openreq_child);
+4 -1
net/ipv4/tcp_output.c
··· 2344 2344 2345 2345 if (unlikely(TCP_SKB_CB(skb)->eor) || 2346 2346 tcp_has_tx_tstamp(skb) || 2347 - !skb_pure_zcopy_same(skb, next)) 2347 + !skb_pure_zcopy_same(skb, next) || 2348 + skb_frags_readable(skb) != skb_frags_readable(next)) 2348 2349 return false; 2349 2350 2350 2351 len -= skb->len; ··· 3264 3263 if (tcp_skb_pcount(skb) > 1) 3265 3264 return false; 3266 3265 if (skb_cloned(skb)) 3266 + return false; 3267 + if (!skb_frags_readable(skb)) 3267 3268 return false; 3268 3269 /* Some heuristics for collapsing over SACK'd could be invented */ 3269 3270 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+2 -1
net/ipv6/esp6.c
··· 132 132 */ 133 133 if (req->src != req->dst) 134 134 for (sg = sg_next(req->src); sg; sg = sg_next(sg)) 135 - skb_page_unref(sg_page(sg), skb->pp_recycle); 135 + skb_page_unref(page_to_netmem(sg_page(sg)), 136 + skb->pp_recycle); 136 137 } 137 138 138 139 #ifdef CONFIG_INET6_ESPINTCP
+2 -2
net/packet/af_packet.c
··· 2216 2216 } 2217 2217 } 2218 2218 2219 - snaplen = skb->len; 2219 + snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); 2220 2220 2221 2221 res = run_filter(skb, sk, snaplen); 2222 2222 if (!res) ··· 2336 2336 } 2337 2337 } 2338 2338 2339 - snaplen = skb->len; 2339 + snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); 2340 2340 2341 2341 res = run_filter(skb, sk, snaplen); 2342 2342 if (!res)
+5
net/xdp/xsk_buff_pool.c
··· 211 211 goto err_unreg_pool; 212 212 } 213 213 214 + if (dev_get_min_mp_channel_count(netdev)) { 215 + err = -EBUSY; 216 + goto err_unreg_pool; 217 + } 218 + 214 219 bpf.command = XDP_SETUP_XSK_POOL; 215 220 bpf.xsk.pool = pool; 216 221 bpf.xsk.queue_id = queue_id;
+13
tools/include/uapi/linux/netdev.h
··· 93 93 NETDEV_A_PAGE_POOL_INFLIGHT, 94 94 NETDEV_A_PAGE_POOL_INFLIGHT_MEM, 95 95 NETDEV_A_PAGE_POOL_DETACH_TIME, 96 + NETDEV_A_PAGE_POOL_DMABUF, 96 97 97 98 __NETDEV_A_PAGE_POOL_MAX, 98 99 NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) ··· 132 131 NETDEV_A_QUEUE_IFINDEX, 133 132 NETDEV_A_QUEUE_TYPE, 134 133 NETDEV_A_QUEUE_NAPI_ID, 134 + NETDEV_A_QUEUE_DMABUF, 135 135 136 136 __NETDEV_A_QUEUE_MAX, 137 137 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 176 174 }; 177 175 178 176 enum { 177 + NETDEV_A_DMABUF_IFINDEX = 1, 178 + NETDEV_A_DMABUF_QUEUES, 179 + NETDEV_A_DMABUF_FD, 180 + NETDEV_A_DMABUF_ID, 181 + 182 + __NETDEV_A_DMABUF_MAX, 183 + NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) 184 + }; 185 + 186 + enum { 179 187 NETDEV_CMD_DEV_GET = 1, 180 188 NETDEV_CMD_DEV_ADD_NTF, 181 189 NETDEV_CMD_DEV_DEL_NTF, ··· 198 186 NETDEV_CMD_QUEUE_GET, 199 187 NETDEV_CMD_NAPI_GET, 200 188 NETDEV_CMD_QSTATS_GET, 189 + NETDEV_CMD_BIND_RX, 201 190 202 191 __NETDEV_CMD_MAX, 203 192 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+1
tools/net/ynl/lib/.gitignore
··· 1 1 __pycache__/ 2 + *.d
+1
tools/testing/selftests/net/.gitignore
··· 17 17 ipv6_flowlabel_mgr 18 18 log.txt 19 19 msg_zerocopy 20 + ncdevmem 20 21 nettest 21 22 psock_fanout 22 23 psock_snd
+9
tools/testing/selftests/net/Makefile
··· 97 97 TEST_PROGS += vlan_hw_filter.sh 98 98 TEST_PROGS += bpf_offload.py 99 99 100 + # YNL files, must be before "include ..lib.mk" 101 + EXTRA_CLEAN += $(OUTPUT)/libynl.a 102 + YNL_GEN_FILES := ncdevmem 103 + TEST_GEN_FILES += $(YNL_GEN_FILES) 104 + 100 105 TEST_FILES := settings 101 106 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh 102 107 ··· 110 105 TEST_INCLUDES := forwarding/lib.sh 111 106 112 107 include ../lib.mk 108 + 109 + # YNL build 110 + YNL_GENS := netdev 111 + include ynl.mk 113 112 114 113 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap 115 114 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
+570
tools/testing/selftests/net/ncdevmem.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #define __EXPORTED_HEADERS__ 4 + 5 + #include <linux/uio.h> 6 + #include <stdio.h> 7 + #include <stdlib.h> 8 + #include <unistd.h> 9 + #include <stdbool.h> 10 + #include <string.h> 11 + #include <errno.h> 12 + #define __iovec_defined 13 + #include <fcntl.h> 14 + #include <malloc.h> 15 + #include <error.h> 16 + 17 + #include <arpa/inet.h> 18 + #include <sys/socket.h> 19 + #include <sys/mman.h> 20 + #include <sys/ioctl.h> 21 + #include <sys/syscall.h> 22 + 23 + #include <linux/memfd.h> 24 + #include <linux/dma-buf.h> 25 + #include <linux/udmabuf.h> 26 + #include <libmnl/libmnl.h> 27 + #include <linux/types.h> 28 + #include <linux/netlink.h> 29 + #include <linux/genetlink.h> 30 + #include <linux/netdev.h> 31 + #include <time.h> 32 + #include <net/if.h> 33 + 34 + #include "netdev-user.h" 35 + #include <ynl.h> 36 + 37 + #define PAGE_SHIFT 12 38 + #define TEST_PREFIX "ncdevmem" 39 + #define NUM_PAGES 16000 40 + 41 + #ifndef MSG_SOCK_DEVMEM 42 + #define MSG_SOCK_DEVMEM 0x2000000 43 + #endif 44 + 45 + /* 46 + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP 47 + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. 48 + * 49 + * Usage: 50 + * 51 + * On server: 52 + * ncdevmem -s <server IP> -c <client IP> -f eth1 -l -p 5201 -v 7 53 + * 54 + * On client: 55 + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ 56 + * tr \\n \\0 | \ 57 + * head -c 5G | \ 58 + * nc <server IP> 5201 -p 5201 59 + * 60 + * Note this is compatible with regular netcat. i.e. the sender or receiver can 61 + * be replaced with regular netcat to test the RX or TX path in isolation. 
62 + */ 63 + 64 + static char *server_ip = "192.168.1.4"; 65 + static char *client_ip = "192.168.1.2"; 66 + static char *port = "5201"; 67 + static size_t do_validation; 68 + static int start_queue = 8; 69 + static int num_queues = 8; 70 + static char *ifname = "eth1"; 71 + static unsigned int ifindex; 72 + static unsigned int dmabuf_id; 73 + 74 + void print_bytes(void *ptr, size_t size) 75 + { 76 + unsigned char *p = ptr; 77 + int i; 78 + 79 + for (i = 0; i < size; i++) 80 + printf("%02hhX ", p[i]); 81 + printf("\n"); 82 + } 83 + 84 + void print_nonzero_bytes(void *ptr, size_t size) 85 + { 86 + unsigned char *p = ptr; 87 + unsigned int i; 88 + 89 + for (i = 0; i < size; i++) 90 + putchar(p[i]); 91 + printf("\n"); 92 + } 93 + 94 + void validate_buffer(void *line, size_t size) 95 + { 96 + static unsigned char seed = 1; 97 + unsigned char *ptr = line; 98 + int errors = 0; 99 + size_t i; 100 + 101 + for (i = 0; i < size; i++) { 102 + if (ptr[i] != seed) { 103 + fprintf(stderr, 104 + "Failed validation: expected=%u, actual=%u, index=%lu\n", 105 + seed, ptr[i], i); 106 + errors++; 107 + if (errors > 20) 108 + error(1, 0, "validation failed."); 109 + } 110 + seed++; 111 + if (seed == do_validation) 112 + seed = 0; 113 + } 114 + 115 + fprintf(stdout, "Validated buffer\n"); 116 + } 117 + 118 + #define run_command(cmd, ...) \ 119 + ({ \ 120 + char command[256]; \ 121 + memset(command, 0, sizeof(command)); \ 122 + snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ 123 + printf("Running: %s\n", command); \ 124 + system(command); \ 125 + }) 126 + 127 + static int reset_flow_steering(void) 128 + { 129 + int ret = 0; 130 + 131 + ret = run_command("sudo ethtool -K %s ntuple off", ifname); 132 + if (ret) 133 + return ret; 134 + 135 + return run_command("sudo ethtool -K %s ntuple on", ifname); 136 + } 137 + 138 + static int configure_headersplit(bool on) 139 + { 140 + return run_command("sudo ethtool -G %s tcp-data-split %s", ifname, 141 + on ? 
"on" : "off"); 142 + } 143 + 144 + static int configure_rss(void) 145 + { 146 + return run_command("sudo ethtool -X %s equal %d", ifname, start_queue); 147 + } 148 + 149 + static int configure_channels(unsigned int rx, unsigned int tx) 150 + { 151 + return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx); 152 + } 153 + 154 + static int configure_flow_steering(void) 155 + { 156 + return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", 157 + ifname, client_ip, server_ip, port, port, start_queue); 158 + } 159 + 160 + static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, 161 + struct netdev_queue_id *queues, 162 + unsigned int n_queue_index, struct ynl_sock **ys) 163 + { 164 + struct netdev_bind_rx_req *req = NULL; 165 + struct netdev_bind_rx_rsp *rsp = NULL; 166 + struct ynl_error yerr; 167 + 168 + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); 169 + if (!*ys) { 170 + fprintf(stderr, "YNL: %s\n", yerr.msg); 171 + return -1; 172 + } 173 + 174 + req = netdev_bind_rx_req_alloc(); 175 + netdev_bind_rx_req_set_ifindex(req, ifindex); 176 + netdev_bind_rx_req_set_fd(req, dmabuf_fd); 177 + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); 178 + 179 + rsp = netdev_bind_rx(*ys, req); 180 + if (!rsp) { 181 + perror("netdev_bind_rx"); 182 + goto err_close; 183 + } 184 + 185 + if (!rsp->_present.id) { 186 + perror("id not present"); 187 + goto err_close; 188 + } 189 + 190 + printf("got dmabuf id=%d\n", rsp->id); 191 + dmabuf_id = rsp->id; 192 + 193 + netdev_bind_rx_req_free(req); 194 + netdev_bind_rx_rsp_free(rsp); 195 + 196 + return 0; 197 + 198 + err_close: 199 + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); 200 + netdev_bind_rx_req_free(req); 201 + ynl_sock_destroy(*ys); 202 + return -1; 203 + } 204 + 205 + static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) 206 + { 207 + struct udmabuf_create create; 208 + int ret; 209 + 210 + *devfd = 
open("/dev/udmabuf", O_RDWR); 211 + if (*devfd < 0) { 212 + error(70, 0, 213 + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", 214 + TEST_PREFIX); 215 + } 216 + 217 + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); 218 + if (*memfd < 0) 219 + error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX); 220 + 221 + /* Required for udmabuf */ 222 + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); 223 + if (ret < 0) 224 + error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); 225 + 226 + ret = ftruncate(*memfd, dmabuf_size); 227 + if (ret == -1) 228 + error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); 229 + 230 + memset(&create, 0, sizeof(create)); 231 + 232 + create.memfd = *memfd; 233 + create.offset = 0; 234 + create.size = dmabuf_size; 235 + *buf = ioctl(*devfd, UDMABUF_CREATE, &create); 236 + if (*buf < 0) 237 + error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); 238 + } 239 + 240 + int do_server(void) 241 + { 242 + char ctrl_data[sizeof(int) * 20000]; 243 + struct netdev_queue_id *queues; 244 + size_t non_page_aligned_frags = 0; 245 + struct sockaddr_in client_addr; 246 + struct sockaddr_in server_sin; 247 + size_t page_aligned_frags = 0; 248 + int devfd, memfd, buf, ret; 249 + size_t total_received = 0; 250 + socklen_t client_addr_len; 251 + bool is_devmem = false; 252 + char *buf_mem = NULL; 253 + struct ynl_sock *ys; 254 + size_t dmabuf_size; 255 + char iobuf[819200]; 256 + char buffer[256]; 257 + int socket_fd; 258 + int client_fd; 259 + size_t i = 0; 260 + int opt = 1; 261 + 262 + dmabuf_size = getpagesize() * NUM_PAGES; 263 + 264 + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); 265 + 266 + if (reset_flow_steering()) 267 + error(1, 0, "Failed to reset flow steering\n"); 268 + 269 + /* Configure RSS to divert all traffic from our devmem queues */ 270 + if (configure_rss()) 271 + error(1, 0, "Failed to configure rss\n"); 272 + 273 + /* Flow steer our devmem flows to start_queue */ 274 + if (configure_flow_steering()) 
275 + error(1, 0, "Failed to configure flow steering\n"); 276 + 277 + sleep(1); 278 + 279 + queues = malloc(sizeof(*queues) * num_queues); 280 + 281 + for (i = 0; i < num_queues; i++) { 282 + queues[i]._present.type = 1; 283 + queues[i]._present.id = 1; 284 + queues[i].type = NETDEV_QUEUE_TYPE_RX; 285 + queues[i].id = start_queue + i; 286 + } 287 + 288 + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) 289 + error(1, 0, "Failed to bind\n"); 290 + 291 + buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED, 292 + buf, 0); 293 + if (buf_mem == MAP_FAILED) 294 + error(1, 0, "mmap()"); 295 + 296 + server_sin.sin_family = AF_INET; 297 + server_sin.sin_port = htons(atoi(port)); 298 + 299 + ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); 300 + if (ret != 1) 301 + error(79, 0, "%s: [FAIL, parse server address]\n", TEST_PREFIX); 302 + 303 + socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); 304 + if (socket_fd < 0) 305 + error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); 306 + 307 + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, 308 + sizeof(opt)); 309 + if (ret) 310 + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); 311 + 312 + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, 313 + sizeof(opt)); 314 + if (ret) 315 + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); 316 + 317 + printf("binding to address %s:%d\n", server_ip, 318 + ntohs(server_sin.sin_port)); 319 + 320 + ret = bind(socket_fd, &server_sin, sizeof(server_sin)); 321 + if (ret) 322 + error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); 323 + 324 + ret = listen(socket_fd, 1); 325 + if (ret) 326 + error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); 327 + 328 + client_addr_len = sizeof(client_addr); 329 + 330 + inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, 331 + sizeof(buffer)); 332 + printf("Waiting for connection on %s:%d\n", buffer, 333 + ntohs(server_sin.sin_port)); 334 + 
client_fd = accept(socket_fd, &client_addr, &client_addr_len); 335 + 336 + inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, 337 + sizeof(buffer)); 338 + printf("Got connection from %s:%d\n", buffer, 339 + ntohs(client_addr.sin_port)); 340 + 341 + while (1) { 342 + struct iovec iov = { .iov_base = iobuf, 343 + .iov_len = sizeof(iobuf) }; 344 + struct dmabuf_cmsg *dmabuf_cmsg = NULL; 345 + struct dma_buf_sync sync = { 0 }; 346 + struct cmsghdr *cm = NULL; 347 + struct msghdr msg = { 0 }; 348 + struct dmabuf_token token; 349 + ssize_t ret; 350 + 351 + is_devmem = false; 352 + printf("\n\n"); 353 + 354 + msg.msg_iov = &iov; 355 + msg.msg_iovlen = 1; 356 + msg.msg_control = ctrl_data; 357 + msg.msg_controllen = sizeof(ctrl_data); 358 + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); 359 + printf("recvmsg ret=%ld\n", ret); 360 + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) 361 + continue; 362 + if (ret < 0) { 363 + perror("recvmsg"); 364 + continue; 365 + } 366 + if (ret == 0) { 367 + printf("client exited\n"); 368 + goto cleanup; 369 + } 370 + 371 + i++; 372 + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { 373 + if (cm->cmsg_level != SOL_SOCKET || 374 + (cm->cmsg_type != SCM_DEVMEM_DMABUF && 375 + cm->cmsg_type != SCM_DEVMEM_LINEAR)) { 376 + fprintf(stdout, "skipping non-devmem cmsg\n"); 377 + continue; 378 + } 379 + 380 + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm); 381 + is_devmem = true; 382 + 383 + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) { 384 + /* TODO: process data copied from skb's linear 385 + * buffer. 386 + */ 387 + fprintf(stdout, 388 + "SCM_DEVMEM_LINEAR. 
dmabuf_cmsg->frag_size=%u\n", 389 + dmabuf_cmsg->frag_size); 390 + 391 + continue; 392 + } 393 + 394 + token.token_start = dmabuf_cmsg->frag_token; 395 + token.token_count = 1; 396 + 397 + total_received += dmabuf_cmsg->frag_size; 398 + printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", 399 + dmabuf_cmsg->frag_offset >> PAGE_SHIFT, 400 + dmabuf_cmsg->frag_offset % getpagesize(), 401 + dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size, 402 + dmabuf_cmsg->frag_token, total_received, 403 + dmabuf_cmsg->dmabuf_id); 404 + 405 + if (dmabuf_cmsg->dmabuf_id != dmabuf_id) 406 + error(1, 0, 407 + "received on wrong dmabuf_id: flow steering error\n"); 408 + 409 + if (dmabuf_cmsg->frag_size % getpagesize()) 410 + non_page_aligned_frags++; 411 + else 412 + page_aligned_frags++; 413 + 414 + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; 415 + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); 416 + 417 + if (do_validation) 418 + validate_buffer( 419 + ((unsigned char *)buf_mem) + 420 + dmabuf_cmsg->frag_offset, 421 + dmabuf_cmsg->frag_size); 422 + else 423 + print_nonzero_bytes( 424 + ((unsigned char *)buf_mem) + 425 + dmabuf_cmsg->frag_offset, 426 + dmabuf_cmsg->frag_size); 427 + 428 + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; 429 + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); 430 + 431 + ret = setsockopt(client_fd, SOL_SOCKET, 432 + SO_DEVMEM_DONTNEED, &token, 433 + sizeof(token)); 434 + if (ret != 1) 435 + error(1, 0, 436 + "SO_DEVMEM_DONTNEED not enough tokens"); 437 + } 438 + if (!is_devmem) 439 + error(1, 0, "flow steering error\n"); 440 + 441 + printf("total_received=%lu\n", total_received); 442 + } 443 + 444 + fprintf(stdout, "%s: ok\n", TEST_PREFIX); 445 + 446 + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", 447 + page_aligned_frags, non_page_aligned_frags); 448 + 449 + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", 450 + page_aligned_frags, 
non_page_aligned_frags); 451 + 452 + cleanup: 453 + 454 + munmap(buf_mem, dmabuf_size); 455 + close(client_fd); 456 + close(socket_fd); 457 + close(buf); 458 + close(memfd); 459 + close(devfd); 460 + ynl_sock_destroy(ys); 461 + 462 + return 0; 463 + } 464 + 465 + void run_devmem_tests(void) 466 + { 467 + struct netdev_queue_id *queues; 468 + int devfd, memfd, buf; 469 + struct ynl_sock *ys; 470 + size_t dmabuf_size; 471 + size_t i = 0; 472 + 473 + dmabuf_size = getpagesize() * NUM_PAGES; 474 + 475 + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); 476 + 477 + /* Configure RSS to divert all traffic from our devmem queues */ 478 + if (configure_rss()) 479 + error(1, 0, "rss error\n"); 480 + 481 + queues = calloc(num_queues, sizeof(*queues)); 482 + 483 + if (configure_headersplit(1)) 484 + error(1, 0, "Failed to configure header split\n"); 485 + 486 + if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) 487 + error(1, 0, "Binding empty queues array should have failed\n"); 488 + 489 + for (i = 0; i < num_queues; i++) { 490 + queues[i]._present.type = 1; 491 + queues[i]._present.id = 1; 492 + queues[i].type = NETDEV_QUEUE_TYPE_RX; 493 + queues[i].id = start_queue + i; 494 + } 495 + 496 + if (configure_headersplit(0)) 497 + error(1, 0, "Failed to configure header split\n"); 498 + 499 + if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) 500 + error(1, 0, "Configure dmabuf with header split off should have failed\n"); 501 + 502 + if (configure_headersplit(1)) 503 + error(1, 0, "Failed to configure header split\n"); 504 + 505 + for (i = 0; i < num_queues; i++) { 506 + queues[i]._present.type = 1; 507 + queues[i]._present.id = 1; 508 + queues[i].type = NETDEV_QUEUE_TYPE_RX; 509 + queues[i].id = start_queue + i; 510 + } 511 + 512 + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) 513 + error(1, 0, "Failed to bind\n"); 514 + 515 + /* Deactivating a bound queue should not be legal */ 516 + if (!configure_channels(num_queues, num_queues - 1)) 517 + 
error(1, 0, "Deactivating a bound queue should be illegal.\n"); 518 + 519 + /* Closing the netlink socket does an implicit unbind */ 520 + ynl_sock_destroy(ys); 521 + } 522 + 523 + int main(int argc, char *argv[]) 524 + { 525 + int is_server = 0, opt; 526 + 527 + while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) { 528 + switch (opt) { 529 + case 'l': 530 + is_server = 1; 531 + break; 532 + case 's': 533 + server_ip = optarg; 534 + break; 535 + case 'c': 536 + client_ip = optarg; 537 + break; 538 + case 'p': 539 + port = optarg; 540 + break; 541 + case 'v': 542 + do_validation = atoll(optarg); 543 + break; 544 + case 'q': 545 + num_queues = atoi(optarg); 546 + break; 547 + case 't': 548 + start_queue = atoi(optarg); 549 + break; 550 + case 'f': 551 + ifname = optarg; 552 + break; 553 + case '?': 554 + printf("unknown option: %c\n", optopt); 555 + break; 556 + } 557 + } 558 + 559 + ifindex = if_nametoindex(ifname); 560 + 561 + for (; optind < argc; optind++) 562 + printf("extra arguments: %s\n", argv[optind]); 563 + 564 + run_devmem_tests(); 565 + 566 + if (is_server) 567 + return do_server(); 568 + 569 + return 0; 570 + }