
xsk: Introduce AF_XDP buffer allocation API

In order to simplify AF_XDP zero-copy enablement for NIC driver
developers, a new AF_XDP buffer allocation API is added. The
implementation is based on a single core (single producer/consumer)
buffer pool for the AF_XDP UMEM.

A buffer is allocated using the xsk_buff_alloc() function, and
returned using xsk_buff_free(). When a buffer is disassociated from the
pool, e.g. when it is passed to an AF_XDP socket, the buffer is said to
be released. Currently, the release function is only used by the AF_XDP
internals and is not visible to the driver.
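
As an illustration only, a zero-copy driver's Rx refill path might use the
allocation API roughly as sketched below. struct my_rx_ring, struct
my_rx_desc, my_refill_rx_ring() and their fields are hypothetical names
invented for this sketch; xsk_buff_alloc(), xsk_buff_free() and
xsk_buff_xdp_get_dma() are the functions this patch adds.

	/* Hypothetical refill loop for an AF_XDP zero-copy Rx ring (sketch). */
	struct my_rx_ring {
		struct xdp_umem *umem;		/* UMEM bound to this queue */
		struct xdp_buff **xdp_bufs;	/* one entry per HW descriptor */
		struct my_rx_desc *descs;	/* hypothetical HW descriptor array */
		u16 count;			/* assumed to be a power of two */
	};

	static u16 my_refill_rx_ring(struct my_rx_ring *ring, u16 start, u16 budget)
	{
		u16 done = 0;

		while (done < budget) {
			u16 idx = (start + done) & (ring->count - 1);
			struct xdp_buff *xdp = xsk_buff_alloc(ring->umem);

			if (!xdp)
				break;		/* fill ring empty for now */

			ring->xdp_bufs[idx] = xdp;
			/* DMA address of the packet data area, ready for the NIC. */
			ring->descs[idx].dma = xsk_buff_xdp_get_dma(xdp);
			done++;
		}
		return done;
	}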

Drivers using this API should register the XDP memory model with the
new MEM_TYPE_XSK_BUFF_POOL type.
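
In a driver, that registration could look roughly like the sketch below
during queue setup. The my_rx_ring structure is hypothetical and assumed to
embed a struct xdp_rxq_info (xdp_rxq) that has already been registered with
xdp_rxq_info_reg(); xsk_buff_dma_map(), xsk_buff_set_rxq_info() and
MEM_TYPE_XSK_BUFF_POOL come from this patch, and
xdp_rxq_info_reg_mem_model() is the existing XDP core helper.

	/* Sketch of AF_XDP zero-copy setup for one Rx queue (unwinding omitted). */
	static int my_setup_xsk_rx_queue(struct my_rx_ring *ring, struct device *dev,
					 struct xdp_umem *umem)
	{
		int err;

		/* Map the UMEM pages for this device (0 = no special DMA attrs). */
		err = xsk_buff_dma_map(umem, dev, 0);
		if (err)
			return err;

		/* Let every buffer in the pool know which rxq it belongs to. */
		xsk_buff_set_rxq_info(umem, &ring->xdp_rxq);

		/* Buffers on this queue are recycled through the xsk buffer pool. */
		return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
						  MEM_TYPE_XSK_BUFF_POOL, NULL);
	}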

The API is defined in net/xdp_sock_drv.h.

The buffer type is struct xdp_buff, and it follows the lifetime of
regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to
a NAPI context. In other words, the API does not replace xdp_frames.
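
In practice this means a buffer taken from the pool is either handed off
(e.g. redirected into the socket, or converted to an xdp_frame) or given
back with xsk_buff_free() before the poll routine completes. A heavily
trimmed, hypothetical Rx handler, reusing the my_rx_ring sketch from above
(my_clean_rx_desc() and my_run_xdp_prog() are made-up names):

	/* Sketch: handling one received descriptor inside a NAPI poll. */
	static void my_clean_rx_desc(struct my_rx_ring *ring, u16 idx, u32 len)
	{
		struct xdp_buff *xdp = ring->xdp_bufs[idx];
		u32 act;

		xdp->data_end = xdp->data + len;
		xsk_buff_dma_sync_for_cpu(xdp);	/* no-op when no sync is needed */

		act = my_run_xdp_prog(ring, xdp); /* bpf_prog_run_xdp() + verdict */
		switch (act) {
		case XDP_REDIRECT:
			/* Ownership has moved on, e.g. into the AF_XDP Rx ring. */
			break;
		default:
			/* Dropped or otherwise consumed: back to the pool, still
			 * within the NAPI context.
			 */
			xsk_buff_free(xdp);
			break;
		}
	}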

In addition to introducing the API and its implementation, this change
migrates the AF_XDP core to use the new API.

rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test
robot)
Added headroom/chunk size getter. (Maxim/Björn)

v1->v2: Swapped SoBs. (Maxim)

v2->v3: Initialize struct xdp_buff member frame_sz. (Björn)
Add API to query the DMA address of a frame. (Maxim)
Do DMA sync for CPU up to the end of the frame to handle
possible growth (frame_sz). (Maxim)
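
For example, the DMA-address and sync-for-device helpers mentioned in the
v3 notes might be used on a zero-copy Tx path roughly as below. struct
my_tx_ring and my_post_tx_desc() are hypothetical; xsk_buff_raw_get_dma()
and xsk_buff_raw_dma_sync_for_device() are part of the new API, and struct
xdp_desc is the existing AF_XDP descriptor.

	/* Sketch: posting one AF_XDP Tx descriptor (hypothetical driver code). */
	static void my_xmit_xsk_desc(struct my_tx_ring *ring, struct xdp_desc *desc)
	{
		dma_addr_t dma = xsk_buff_raw_get_dma(ring->umem, desc->addr);

		/* The frame was written by the application/CPU; make it visible
		 * to the device before the descriptor is posted (no-op on
		 * coherent setups).
		 */
		xsk_buff_raw_dma_sync_for_device(ring->umem, dma, desc->len);

		my_post_tx_desc(ring, dma, desc->len);
	}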

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com

Authored by Björn Töpel, committed by Alexei Starovoitov
Commit 2b43470a (parent 89e4a376)

12 files changed, 823 insertions(+), 123 deletions(-)

+3 -1
include/net/xdp.h
···
 	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
 	MEM_TYPE_PAGE_POOL,
 	MEM_TYPE_ZERO_COPY,
+	MEM_TYPE_XSK_BUFF_POOL,
 	MEM_TYPE_MAX,
 };
···
 	int metasize;
 	int headroom;

-	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY ||
+	    xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
 		return xdp_convert_zc_to_xdp_frame(xdp);

 	/* Assure headroom is available for storing info */
+2
include/net/xdp_sock.h
···
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
+	struct xsk_buff_pool *pool;
 	struct xdp_umem_page *pages;
 	u64 chunk_mask;
 	u64 size;
 	u32 headroom;
 	u32 chunk_size_nohr;
+	u32 chunk_size;
 	struct user_struct *user;
 	refcount_t users;
 	struct work_struct work;
+164
include/net/xdp_sock_drv.h
···
 #define _LINUX_XDP_SOCK_DRV_H

 #include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>

 #ifdef CONFIG_XDP_SOCKETS

···
 static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
 {
 	return umem->chunk_size_nohr;
+}
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+	return XDP_PACKET_HEADROOM + umem->headroom;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+	return umem->chunk_size;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+	return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+					 struct xdp_rxq_info *rxq)
+{
+	xp_set_rxq_info(umem->pool, rxq);
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+				      unsigned long attrs)
+{
+	xp_dma_unmap(umem->pool, attrs);
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+				   unsigned long attrs)
+{
+	return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+	return xp_get_dma(xskb);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+	return xp_get_frame_dma(xskb);
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+	return xp_alloc(umem->pool);
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+	return xp_can_alloc(umem->pool, count);
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+	xp_free(xskb);
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return xp_raw_get_dma(umem->pool, addr);
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+	return xp_raw_get_data(umem->pool, addr);
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+	xp_dma_sync_for_cpu(xskb);
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+						    dma_addr_t dma,
+						    size_t size)
+{
+	xp_dma_sync_for_device(umem->pool, dma, size);
 }

 #else
···
 static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
 {
 	return 0;
+}
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+	return 0;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+	return 0;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+	return 0;
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+					 struct xdp_rxq_info *rxq)
+{
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+				      unsigned long attrs)
+{
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+				   unsigned long attrs)
+{
+	return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+	return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+	return 0;
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+	return NULL;
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+	return false;
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return 0;
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+	return NULL;
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+						    dma_addr_t dma,
+						    size_t size)
+{
 }

 #endif /* CONFIG_XDP_SOCKETS */
+56
include/net/xsk_buff_pool.h
···
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef XSK_BUFF_POOL_H_
+#define XSK_BUFF_POOL_H_
+
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <net/xdp.h>
+
+struct xsk_buff_pool;
+struct xdp_rxq_info;
+struct xsk_queue;
+struct xdp_desc;
+struct device;
+struct page;
+
+struct xdp_buff_xsk {
+	struct xdp_buff xdp;
+	dma_addr_t dma;
+	dma_addr_t frame_dma;
+	struct xsk_buff_pool *pool;
+	bool unaligned;
+	u64 orig_addr;
+	struct list_head free_list_node;
+};
+
+/* AF_XDP core. */
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+				u32 chunk_size, u32 headroom, u64 size,
+				bool unaligned);
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
+void xp_destroy(struct xsk_buff_pool *pool);
+void xp_release(struct xdp_buff_xsk *xskb);
+u64 xp_get_handle(struct xdp_buff_xsk *xskb);
+bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
+
+/* AF_XDP, and XDP core. */
+void xp_free(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP ZC drivers, via xdp_sock_buff.h */
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+	       unsigned long attrs, struct page **pages, u32 nr_pages);
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb);
+dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb);
+void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb);
+void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma,
+			    size_t size);
+
+#endif /* XSK_BUFF_POOL_H_ */
+2 -1
include/trace/events/xdp.h
···
 	FN(PAGE_SHARED)		\
 	FN(PAGE_ORDER0)		\
 	FN(PAGE_POOL)		\
-	FN(ZERO_COPY)
+	FN(ZERO_COPY)		\
+	FN(XSK_BUFF_POOL)

 #define __MEM_TYPE_TP_FN(x)	\
 	TRACE_DEFINE_ENUM(MEM_TYPE_##x);
+10 -4
net/core/xdp.c
···
 #include <net/xdp.h>
 #include <net/xdp_priv.h> /* struct xdp_mem_allocator */
 #include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>

 #define REG_STATE_NEW		0x0
 #define REG_STATE_REGISTERED	0x1
···
  * of xdp_frames/pages in those cases.
  */
 static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
-			 unsigned long handle)
+			 unsigned long handle, struct xdp_buff *xdp)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
···
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		xa->zc_alloc->free(xa->zc_alloc, handle);
 		rcu_read_unlock();
+		break;
+	case MEM_TYPE_XSK_BUFF_POOL:
+		/* NB! Only valid from an xdp_buff! */
+		xsk_buff_free(xdp);
+		break;
 	default:
 		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
 		break;
···

 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, false, 0);
+	__xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);

 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, true, 0);
+	__xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);

 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
+1
net/xdp/Makefile
···
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
 obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
+15 -4
net/xdp/xdp_umem.c
···
 	}

 	xsk_reuseq_destroy(umem);
-
+	xp_destroy(umem->pool);
 	xdp_umem_unmap_pages(umem);
 	xdp_umem_unpin_pages(umem);

···
 	umem->size = size;
 	umem->headroom = headroom;
 	umem->chunk_size_nohr = chunk_size - headroom;
+	umem->chunk_size = chunk_size;
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
···
 	}

 	err = xdp_umem_map_pages(umem);
-	if (!err)
-		return 0;
+	if (err)
+		goto out_pages;

+	umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
+			       headroom, size, unaligned_chunks);
+	if (!umem->pool) {
+		err = -ENOMEM;
+		goto out_unmap;
+	}
+	return 0;
+
+out_unmap:
+	xdp_umem_unmap_pages(umem);
+out_pages:
 	kvfree(umem->pages);
-
 out_pin:
 	xdp_umem_unpin_pages(umem);
 out_account:
+55 -94
net/xdp/xsk.c
··· 117 117 } 118 118 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); 119 119 120 - /* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for 121 - * each page. This is only required in copy mode. 122 - */ 123 - static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, 124 - u32 len, u32 metalen) 120 + static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 125 121 { 126 - void *to_buf = xdp_umem_get_data(umem, addr); 122 + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); 123 + u64 addr; 124 + int err; 127 125 128 - addr = xsk_umem_add_offset_to_addr(addr); 129 - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { 130 - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; 131 - u64 page_start = addr & ~(PAGE_SIZE - 1); 132 - u64 first_len = PAGE_SIZE - (addr - page_start); 126 + addr = xp_get_handle(xskb); 127 + err = xskq_prod_reserve_desc(xs->rx, addr, len); 128 + if (err) { 129 + xs->rx_dropped++; 130 + return err; 131 + } 133 132 134 - memcpy(to_buf, from_buf, first_len); 135 - memcpy(next_pg_addr, from_buf + first_len, 136 - len + metalen - first_len); 133 + xp_release(xskb); 134 + return 0; 135 + } 137 136 138 - return; 137 + static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) 138 + { 139 + void *from_buf, *to_buf; 140 + u32 metalen; 141 + 142 + if (unlikely(xdp_data_meta_unsupported(from))) { 143 + from_buf = from->data; 144 + to_buf = to->data; 145 + metalen = 0; 146 + } else { 147 + from_buf = from->data_meta; 148 + metalen = from->data - from->data_meta; 149 + to_buf = to->data - metalen; 139 150 } 140 151 141 152 memcpy(to_buf, from_buf, len + metalen); 142 153 } 143 154 144 - static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 155 + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, 156 + bool explicit_free) 145 157 { 146 - u64 offset = xs->umem->headroom; 147 - u64 addr, memcpy_addr; 148 - void *from_buf; 149 - u32 metalen; 158 + struct xdp_buff *xsk_xdp; 150 159 int err; 151 160 152 - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || 153 - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { 161 + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { 154 162 xs->rx_dropped++; 155 163 return -ENOSPC; 156 164 } 157 165 158 - if (unlikely(xdp_data_meta_unsupported(xdp))) { 159 - from_buf = xdp->data; 160 - metalen = 0; 161 - } else { 162 - from_buf = xdp->data_meta; 163 - metalen = xdp->data - xdp->data_meta; 164 - } 165 - 166 - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 167 - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); 168 - 169 - offset += metalen; 170 - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 171 - err = xskq_prod_reserve_desc(xs->rx, addr, len); 172 - if (!err) { 173 - xskq_cons_release(xs->umem->fq); 174 - xdp_return_buff(xdp); 175 - return 0; 176 - } 177 - 178 - xs->rx_dropped++; 179 - return err; 180 - } 181 - 182 - static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 183 - { 184 - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); 185 - 186 - if (err) 166 + xsk_xdp = xsk_buff_alloc(xs->umem); 167 + if (!xsk_xdp) { 187 168 xs->rx_dropped++; 169 + return -ENOSPC; 170 + } 188 171 189 - return err; 172 + xsk_copy_xdp(xsk_xdp, xdp, len); 173 + err = __xsk_rcv_zc(xs, xsk_xdp, len); 174 + if (err) { 175 + xsk_buff_free(xsk_xdp); 176 + return err; 177 + } 178 + if (explicit_free) 179 + xdp_return_buff(xdp); 180 + 
return 0; 190 181 } 191 182 192 183 static bool xsk_is_bound(struct xdp_sock *xs) ··· 190 199 return false; 191 200 } 192 201 193 - static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 202 + static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, 203 + bool explicit_free) 194 204 { 195 205 u32 len; 196 206 ··· 203 211 204 212 len = xdp->data_end - xdp->data; 205 213 206 - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? 207 - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); 214 + return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || 215 + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? 216 + __xsk_rcv_zc(xs, xdp, len) : 217 + __xsk_rcv(xs, xdp, len, explicit_free); 208 218 } 209 219 210 220 static void xsk_flush(struct xdp_sock *xs) ··· 218 224 219 225 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 220 226 { 221 - u32 metalen = xdp->data - xdp->data_meta; 222 - u32 len = xdp->data_end - xdp->data; 223 - u64 offset = xs->umem->headroom; 224 - void *buffer; 225 - u64 addr; 226 227 int err; 227 228 228 229 spin_lock_bh(&xs->rx_lock); 229 - 230 - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { 231 - err = -EINVAL; 232 - goto out_unlock; 233 - } 234 - 235 - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || 236 - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { 237 - err = -ENOSPC; 238 - goto out_drop; 239 - } 240 - 241 - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); 242 - buffer = xdp_umem_get_data(xs->umem, addr); 243 - memcpy(buffer, xdp->data_meta, len + metalen); 244 - 245 - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); 246 - err = xskq_prod_reserve_desc(xs->rx, addr, len); 247 - if (err) 248 - goto out_drop; 249 - 250 - xskq_cons_release(xs->umem->fq); 251 - xskq_prod_submit(xs->rx); 252 - 253 - spin_unlock_bh(&xs->rx_lock); 254 - 255 - xs->sk.sk_data_ready(&xs->sk); 256 - return 0; 257 - 258 - out_drop: 259 - xs->rx_dropped++; 260 - out_unlock: 230 + err = xsk_rcv(xs, xdp, false); 231 + xsk_flush(xs); 261 232 spin_unlock_bh(&xs->rx_lock); 262 233 return err; 263 234 } ··· 232 273 struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); 233 274 int err; 234 275 235 - err = xsk_rcv(xs, xdp); 276 + err = xsk_rcv(xs, xdp, true); 236 277 if (err) 237 278 return err; 238 279 ··· 363 404 364 405 skb_put(skb, len); 365 406 addr = desc.addr; 366 - buffer = xdp_umem_get_data(xs->umem, addr); 407 + buffer = xsk_buff_raw_get_data(xs->umem, addr); 367 408 err = skb_store_bits(skb, 0, buffer, len); 368 409 /* This is the backpressure mechanism for the Tx path. 369 410 * Reserve space in the completion queue and only proceed ··· 819 860 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : 820 861 &xs->umem->cq; 821 862 err = xsk_init_queue(entries, q, true); 863 + if (optname == XDP_UMEM_FILL_RING) 864 + xp_set_fq(xs->umem->pool, *q); 822 865 mutex_unlock(&xs->mutex); 823 866 return err; 824 867 }
+467
net/xdp/xsk_buff_pool.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <net/xsk_buff_pool.h> 4 + #include <net/xdp_sock.h> 5 + #include <linux/dma-direct.h> 6 + #include <linux/dma-noncoherent.h> 7 + #include <linux/swiotlb.h> 8 + 9 + #include "xsk_queue.h" 10 + 11 + struct xsk_buff_pool { 12 + struct xsk_queue *fq; 13 + struct list_head free_list; 14 + dma_addr_t *dma_pages; 15 + struct xdp_buff_xsk *heads; 16 + u64 chunk_mask; 17 + u64 addrs_cnt; 18 + u32 free_list_cnt; 19 + u32 dma_pages_cnt; 20 + u32 heads_cnt; 21 + u32 free_heads_cnt; 22 + u32 headroom; 23 + u32 chunk_size; 24 + u32 frame_len; 25 + bool cheap_dma; 26 + bool unaligned; 27 + void *addrs; 28 + struct device *dev; 29 + struct xdp_buff_xsk *free_heads[]; 30 + }; 31 + 32 + static void xp_addr_unmap(struct xsk_buff_pool *pool) 33 + { 34 + vunmap(pool->addrs); 35 + } 36 + 37 + static int xp_addr_map(struct xsk_buff_pool *pool, 38 + struct page **pages, u32 nr_pages) 39 + { 40 + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 41 + if (!pool->addrs) 42 + return -ENOMEM; 43 + return 0; 44 + } 45 + 46 + void xp_destroy(struct xsk_buff_pool *pool) 47 + { 48 + if (!pool) 49 + return; 50 + 51 + xp_addr_unmap(pool); 52 + kvfree(pool->heads); 53 + kvfree(pool); 54 + } 55 + 56 + struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, 57 + u32 chunk_size, u32 headroom, u64 size, 58 + bool unaligned) 59 + { 60 + struct xsk_buff_pool *pool; 61 + struct xdp_buff_xsk *xskb; 62 + int err; 63 + u32 i; 64 + 65 + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); 66 + if (!pool) 67 + goto out; 68 + 69 + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); 70 + if (!pool->heads) 71 + goto out; 72 + 73 + pool->chunk_mask = ~((u64)chunk_size - 1); 74 + pool->addrs_cnt = size; 75 + pool->heads_cnt = chunks; 76 + pool->free_heads_cnt = chunks; 77 + pool->headroom = headroom; 78 + pool->chunk_size = chunk_size; 79 + pool->cheap_dma = true; 80 + pool->unaligned = unaligned; 81 + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; 82 + INIT_LIST_HEAD(&pool->free_list); 83 + 84 + for (i = 0; i < pool->free_heads_cnt; i++) { 85 + xskb = &pool->heads[i]; 86 + xskb->pool = pool; 87 + xskb->xdp.frame_sz = chunk_size - headroom; 88 + pool->free_heads[i] = xskb; 89 + } 90 + 91 + err = xp_addr_map(pool, pages, nr_pages); 92 + if (!err) 93 + return pool; 94 + 95 + out: 96 + xp_destroy(pool); 97 + return NULL; 98 + } 99 + 100 + void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) 101 + { 102 + pool->fq = fq; 103 + } 104 + 105 + void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) 106 + { 107 + u32 i; 108 + 109 + for (i = 0; i < pool->heads_cnt; i++) 110 + pool->heads[i].xdp.rxq = rxq; 111 + } 112 + EXPORT_SYMBOL(xp_set_rxq_info); 113 + 114 + void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) 115 + { 116 + dma_addr_t *dma; 117 + u32 i; 118 + 119 + if (pool->dma_pages_cnt == 0) 120 + return; 121 + 122 + for (i = 0; i < pool->dma_pages_cnt; i++) { 123 + dma = &pool->dma_pages[i]; 124 + if (*dma) { 125 + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, 126 + DMA_BIDIRECTIONAL, attrs); 127 + *dma = 0; 128 + } 129 + } 130 + 131 + kvfree(pool->dma_pages); 132 + pool->dma_pages_cnt = 0; 133 + pool->dev = NULL; 134 + } 135 + EXPORT_SYMBOL(xp_dma_unmap); 136 + 137 + static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) 138 + { 139 + u32 i; 140 + 141 + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { 142 + if (pool->dma_pages[i] + PAGE_SIZE == 
pool->dma_pages[i + 1]) 143 + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; 144 + else 145 + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; 146 + } 147 + } 148 + 149 + static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) 150 + { 151 + #if defined(CONFIG_SWIOTLB) 152 + phys_addr_t paddr; 153 + u32 i; 154 + 155 + for (i = 0; i < pool->dma_pages_cnt; i++) { 156 + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); 157 + if (is_swiotlb_buffer(paddr)) 158 + return false; 159 + } 160 + #endif 161 + return true; 162 + } 163 + 164 + static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) 165 + { 166 + #if defined(CONFIG_HAS_DMA) 167 + const struct dma_map_ops *ops = get_dma_ops(pool->dev); 168 + 169 + if (ops) { 170 + return !ops->sync_single_for_cpu && 171 + !ops->sync_single_for_device; 172 + } 173 + 174 + if (!dma_is_direct(ops)) 175 + return false; 176 + 177 + if (!xp_check_swiotlb_dma(pool)) 178 + return false; 179 + 180 + if (!dev_is_dma_coherent(pool->dev)) { 181 + #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ 182 + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ 183 + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) 184 + return false; 185 + #endif 186 + } 187 + #endif 188 + return true; 189 + } 190 + 191 + int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, 192 + unsigned long attrs, struct page **pages, u32 nr_pages) 193 + { 194 + dma_addr_t dma; 195 + u32 i; 196 + 197 + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), 198 + GFP_KERNEL); 199 + if (!pool->dma_pages) 200 + return -ENOMEM; 201 + 202 + pool->dev = dev; 203 + pool->dma_pages_cnt = nr_pages; 204 + 205 + for (i = 0; i < pool->dma_pages_cnt; i++) { 206 + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, 207 + DMA_BIDIRECTIONAL, attrs); 208 + if (dma_mapping_error(dev, dma)) { 209 + xp_dma_unmap(pool, attrs); 210 + return -ENOMEM; 211 + } 212 + pool->dma_pages[i] = dma; 213 + } 214 + 215 + if (pool->unaligned) 216 + xp_check_dma_contiguity(pool); 217 + 218 + pool->dev = dev; 219 + pool->cheap_dma = xp_check_cheap_dma(pool); 220 + return 0; 221 + } 222 + EXPORT_SYMBOL(xp_dma_map); 223 + 224 + static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, 225 + u64 addr, u32 len) 226 + { 227 + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; 228 + 229 + if (pool->dma_pages_cnt && cross_pg) { 230 + return !(pool->dma_pages[addr >> PAGE_SHIFT] & 231 + XSK_NEXT_PG_CONTIG_MASK); 232 + } 233 + return false; 234 + } 235 + 236 + static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, 237 + u64 addr) 238 + { 239 + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); 240 + } 241 + 242 + void xp_release(struct xdp_buff_xsk *xskb) 243 + { 244 + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; 245 + } 246 + 247 + static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) 248 + { 249 + return addr & pool->chunk_mask; 250 + } 251 + 252 + static u64 xp_unaligned_extract_addr(u64 addr) 253 + { 254 + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; 255 + } 256 + 257 + static u64 xp_unaligned_extract_offset(u64 addr) 258 + { 259 + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; 260 + } 261 + 262 + static u64 xp_unaligned_add_offset_to_addr(u64 addr) 263 + { 264 + return xp_unaligned_extract_addr(addr) + 265 + xp_unaligned_extract_offset(addr); 266 + } 267 + 268 + static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) 269 + { 270 + *addr = xp_unaligned_extract_addr(*addr); 271 + if (*addr >= pool->addrs_cnt || 
272 + *addr + pool->chunk_size > pool->addrs_cnt || 273 + xp_addr_crosses_non_contig_pg(pool, *addr)) 274 + return false; 275 + return true; 276 + } 277 + 278 + static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) 279 + { 280 + *addr = xp_aligned_extract_addr(pool, *addr); 281 + return *addr < pool->addrs_cnt; 282 + } 283 + 284 + static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) 285 + { 286 + struct xdp_buff_xsk *xskb; 287 + u64 addr; 288 + bool ok; 289 + 290 + if (pool->free_heads_cnt == 0) 291 + return NULL; 292 + 293 + xskb = pool->free_heads[--pool->free_heads_cnt]; 294 + 295 + for (;;) { 296 + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { 297 + xp_release(xskb); 298 + return NULL; 299 + } 300 + 301 + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : 302 + xp_check_aligned(pool, &addr); 303 + if (!ok) { 304 + pool->fq->invalid_descs++; 305 + xskq_cons_release(pool->fq); 306 + continue; 307 + } 308 + break; 309 + } 310 + xskq_cons_release(pool->fq); 311 + 312 + xskb->orig_addr = addr; 313 + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; 314 + if (pool->dma_pages_cnt) { 315 + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & 316 + ~XSK_NEXT_PG_CONTIG_MASK) + 317 + (addr & ~PAGE_MASK); 318 + xskb->dma = xskb->frame_dma + pool->headroom + 319 + XDP_PACKET_HEADROOM; 320 + } 321 + return xskb; 322 + } 323 + 324 + struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) 325 + { 326 + struct xdp_buff_xsk *xskb; 327 + 328 + if (!pool->free_list_cnt) { 329 + xskb = __xp_alloc(pool); 330 + if (!xskb) 331 + return NULL; 332 + } else { 333 + pool->free_list_cnt--; 334 + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, 335 + free_list_node); 336 + list_del(&xskb->free_list_node); 337 + } 338 + 339 + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; 340 + xskb->xdp.data_meta = xskb->xdp.data; 341 + 342 + if (!pool->cheap_dma) { 343 + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, 344 + pool->frame_len, 345 + DMA_BIDIRECTIONAL); 346 + } 347 + return &xskb->xdp; 348 + } 349 + EXPORT_SYMBOL(xp_alloc); 350 + 351 + bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) 352 + { 353 + if (pool->free_list_cnt >= count) 354 + return true; 355 + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); 356 + } 357 + EXPORT_SYMBOL(xp_can_alloc); 358 + 359 + void xp_free(struct xdp_buff_xsk *xskb) 360 + { 361 + xskb->pool->free_list_cnt++; 362 + list_add(&xskb->free_list_node, &xskb->pool->free_list); 363 + } 364 + EXPORT_SYMBOL(xp_free); 365 + 366 + static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, 367 + struct xdp_desc *desc) 368 + { 369 + u64 chunk, chunk_end; 370 + 371 + chunk = xp_aligned_extract_addr(pool, desc->addr); 372 + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); 373 + if (chunk != chunk_end) 374 + return false; 375 + 376 + if (chunk >= pool->addrs_cnt) 377 + return false; 378 + 379 + if (desc->options) 380 + return false; 381 + return true; 382 + } 383 + 384 + static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, 385 + struct xdp_desc *desc) 386 + { 387 + u64 addr, base_addr; 388 + 389 + base_addr = xp_unaligned_extract_addr(desc->addr); 390 + addr = xp_unaligned_add_offset_to_addr(desc->addr); 391 + 392 + if (desc->len > pool->chunk_size) 393 + return false; 394 + 395 + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || 396 + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) 397 + return false; 398 + 399 + if 
(desc->options) 400 + return false; 401 + return true; 402 + } 403 + 404 + bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) 405 + { 406 + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : 407 + xp_aligned_validate_desc(pool, desc); 408 + } 409 + 410 + u64 xp_get_handle(struct xdp_buff_xsk *xskb) 411 + { 412 + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; 413 + 414 + offset += xskb->pool->headroom; 415 + if (!xskb->pool->unaligned) 416 + return xskb->orig_addr + offset; 417 + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); 418 + } 419 + 420 + void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) 421 + { 422 + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; 423 + return pool->addrs + addr; 424 + } 425 + EXPORT_SYMBOL(xp_raw_get_data); 426 + 427 + dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) 428 + { 429 + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; 430 + return (pool->dma_pages[addr >> PAGE_SHIFT] & 431 + ~XSK_NEXT_PG_CONTIG_MASK) + 432 + (addr & ~PAGE_MASK); 433 + } 434 + EXPORT_SYMBOL(xp_raw_get_dma); 435 + 436 + dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) 437 + { 438 + return xskb->dma; 439 + } 440 + EXPORT_SYMBOL(xp_get_dma); 441 + 442 + dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) 443 + { 444 + return xskb->frame_dma; 445 + } 446 + EXPORT_SYMBOL(xp_get_frame_dma); 447 + 448 + void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) 449 + { 450 + if (xskb->pool->cheap_dma) 451 + return; 452 + 453 + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, 454 + xskb->pool->frame_len, DMA_BIDIRECTIONAL); 455 + } 456 + EXPORT_SYMBOL(xp_dma_sync_for_cpu); 457 + 458 + void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, 459 + size_t size) 460 + { 461 + if (pool->cheap_dma) 462 + return; 463 + 464 + dma_sync_single_range_for_device(pool->dev, dma, 0, 465 + size, DMA_BIDIRECTIONAL); 466 + } 467 + EXPORT_SYMBOL(xp_dma_sync_for_device);
+1 -1
net/xdp/xsk_diag.c
···
 	du.id = umem->id;
 	du.size = umem->size;
 	du.num_pages = umem->npgs;
-	du.chunk_size = umem->chunk_size_nohr + umem->headroom;
+	du.chunk_size = umem->chunk_size;
 	du.headroom = umem->headroom;
 	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
 	du.queue_id = umem->queue_id;
+47 -18
net/xdp/xsk_queue.h
···
 #include <linux/types.h>
 #include <linux/if_xdp.h>
 #include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>

 #include "xsk.h"

···
 	return false;
 }

+static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	while (q->cached_cons != q->cached_prod) {
+		u32 idx = q->cached_cons & q->ring_mask;
+
+		*addr = ring->desc[idx];
+		if (xskq_cons_is_valid_addr(q, *addr))
+			return true;
+
+		q->cached_cons++;
+	}
+
+	return false;
+}
+
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	if (q->cached_cons != q->cached_prod) {
+		u32 idx = q->cached_cons & q->ring_mask;
+
+		*addr = ring->desc[idx];
+		return true;
+	}
+
+	return false;
+}
+
 static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
 					   struct xdp_desc *d,
 					   struct xdp_umem *umem)
 {
-	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
-		if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
-			return false;
-
-		if (d->len > umem->chunk_size_nohr || d->options) {
-			q->invalid_descs++;
-			return false;
-		}
-
-		return true;
-	}
-
-	if (!xskq_cons_is_valid_addr(q, d->addr))
-		return false;
-
-	if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
-	    d->options) {
+	if (!xp_validate_desc(umem->pool, d)) {
 		q->invalid_descs++;
 		return false;
 	}
-
 	return true;
 }
···
 	if (q->cached_prod == q->cached_cons)
 		xskq_cons_get_entries(q);
 	return xskq_cons_read_addr(q, addr, umem);
+}
+
+static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cached_prod == q->cached_cons)
+		xskq_cons_get_entries(q);
+	return xskq_cons_read_addr_aligned(q, addr);
+}
+
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cached_prod == q->cached_cons)
+		xskq_cons_get_entries(q);
+	return xskq_cons_read_addr_unchecked(q, addr);
 }

 static inline bool xskq_cons_peek_desc(struct xsk_queue *q,