Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'xsk-the-lost-bits-from-chapter-iii'

Alexander Lobakin says:

====================
xsk: the lost bits from Chapter III

Before introducing libeth_xdp, we need to add a couple more generic
helpers. Notably:

* 01: add generic loop unrolling hint helpers;
* 04: add a helper to get both the xdp_desc's DMA address and metadata
pointer in one go, saving several cycles and reducing hotpath object
code size in drivers (especially when unrolling).

Bonus:

* 02, 03: convert two drivers (i40e, ice) which were using custom macros
to the generic unrolled_count() (trivial, no object code changes).
====================

Link: https://patch.msgid.link/20250206182630.3914318-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+141 -26
+3 -1
drivers/net/ethernet/intel/i40e/i40e_xsk.c
··· 2 2 /* Copyright(c) 2018 Intel Corporation. */ 3 3 4 4 #include <linux/bpf_trace.h> 5 + #include <linux/unroll.h> 5 6 #include <net/xdp_sock_drv.h> 6 7 #include "i40e_txrx_common.h" 7 8 #include "i40e_xsk.h" ··· 530 529 dma_addr_t dma; 531 530 u32 i; 532 531 533 - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { 532 + unrolled_count(PKTS_PER_BATCH) 533 + for (i = 0; i < PKTS_PER_BATCH; i++) { 534 534 u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); 535 535 536 536 dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+1 -9
drivers/net/ethernet/intel/i40e/i40e_xsk.h
··· 6 6 7 7 #include <linux/types.h> 8 8 9 - /* This value should match the pragma in the loop_unrolled_for 9 + /* This value should match the pragma in the unrolled_count() 10 10 * macro. Why 4? It is strictly empirical. It seems to be a good 11 11 * compromise between the advantage of having simultaneous outstanding 12 12 * reads to the DMA array that can hide each others latency and the 13 13 * disadvantage of having a larger code path. 14 14 */ 15 15 #define PKTS_PER_BATCH 4 16 - 17 - #ifdef __clang__ 18 - #define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for 19 - #elif __GNUC__ >= 8 20 - #define loop_unrolled_for _Pragma("GCC unroll 4") for 21 - #else 22 - #define loop_unrolled_for for 23 - #endif 24 16 25 17 struct i40e_ring; 26 18 struct i40e_vsi;
+3 -1
drivers/net/ethernet/intel/ice/ice_xsk.c
··· 2 2 /* Copyright (c) 2019, Intel Corporation. */ 3 3 4 4 #include <linux/bpf_trace.h> 5 + #include <linux/unroll.h> 5 6 #include <net/xdp_sock_drv.h> 6 7 #include <net/xdp.h> 7 8 #include "ice.h" ··· 990 989 struct ice_tx_desc *tx_desc; 991 990 u32 i; 992 991 993 - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { 992 + unrolled_count(PKTS_PER_BATCH) 993 + for (i = 0; i < PKTS_PER_BATCH; i++) { 994 994 dma_addr_t dma; 995 995 996 996 dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
-8
drivers/net/ethernet/intel/ice/ice_xsk.h
··· 7 7 8 8 #define PKTS_PER_BATCH 8 9 9 10 - #ifdef __clang__ 11 - #define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for 12 - #elif __GNUC__ >= 8 13 - #define loop_unrolled_for _Pragma("GCC unroll 8") for 14 - #else 15 - #define loop_unrolled_for for 16 - #endif 17 - 18 10 struct ice_vsi; 19 11 20 12 #ifdef CONFIG_XDP_SOCKETS
+44
include/linux/unroll.h
··· 9 9 10 10 #include <linux/args.h> 11 11 12 + #ifdef CONFIG_CC_IS_CLANG 13 + #define __pick_unrolled(x, y) _Pragma(#x) 14 + #elif CONFIG_GCC_VERSION >= 80000 15 + #define __pick_unrolled(x, y) _Pragma(#y) 16 + #else 17 + #define __pick_unrolled(x, y) /* not supported */ 18 + #endif 19 + 20 + /** 21 + * unrolled - loop attributes to ask the compiler to unroll it 22 + * 23 + * Usage: 24 + * 25 + * #define BATCH 8 26 + * 27 + * unrolled_count(BATCH) 28 + * for (u32 i = 0; i < BATCH; i++) 29 + * // loop body without cross-iteration dependencies 30 + * 31 + * This is only a hint and the compiler is free to disable unrolling if it 32 + * thinks the count is suboptimal and may hurt performance and/or hugely 33 + * increase object code size. 34 + * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends 35 + * on what iter x will do with variables) is not a strict requirement, but 36 + * provides best performance and object code size. 37 + * Available only on Clang and GCC 8.x onwards. 38 + */ 39 + 40 + /* Ask the compiler to pick an optimal unroll count, Clang only */ 41 + #define unrolled \ 42 + __pick_unrolled(clang loop unroll(enable), /* nothing */) 43 + 44 + /* Unroll each @n iterations of the loop */ 45 + #define unrolled_count(n) \ 46 + __pick_unrolled(clang loop unroll_count(n), GCC unroll n) 47 + 48 + /* Unroll the whole loop */ 49 + #define unrolled_full \ 50 + __pick_unrolled(clang loop unroll(full), GCC unroll 65534) 51 + 52 + /* Never unroll the loop */ 53 + #define unrolled_none \ 54 + __pick_unrolled(clang loop unroll(disable), GCC unroll 1) 55 + 12 56 #define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args) 13 57 14 58 #define __UNROLL_0(MACRO, args...)
+40 -3
include/net/xdp_sock_drv.h
··· 196 196 return xp_raw_get_data(pool, addr); 197 197 } 198 198 199 + /** 200 + * xsk_buff_raw_get_ctx - get &xdp_desc context 201 + * @pool: XSk buff pool desc address belongs to 202 + * @addr: desc address (from userspace) 203 + * 204 + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for 205 + * details. 206 + * 207 + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata 208 + * pointer, if it is present and valid (initialized to %NULL otherwise). 209 + */ 210 + static inline struct xdp_desc_ctx 211 + xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) 212 + { 213 + return xp_raw_get_ctx(pool, addr); 214 + } 215 + 199 216 #define XDP_TXMD_FLAGS_VALID ( \ 200 217 XDP_TXMD_FLAGS_TIMESTAMP | \ 201 218 XDP_TXMD_FLAGS_CHECKSUM | \ ··· 224 207 return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); 225 208 } 226 209 227 - static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) 210 + static inline struct xsk_tx_metadata * 211 + __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) 228 212 { 229 213 struct xsk_tx_metadata *meta; 230 214 231 215 if (!pool->tx_metadata_len) 232 216 return NULL; 233 217 234 - meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; 218 + meta = data - pool->tx_metadata_len; 235 219 if (unlikely(!xsk_buff_valid_tx_metadata(meta))) 236 220 return NULL; /* no way to signal the error to the user */ 237 221 238 222 return meta; 223 + } 224 + 225 + static inline struct xsk_tx_metadata * 226 + xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) 227 + { 228 + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); 239 229 } 240 230 241 231 static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) ··· 412 388 return NULL; 413 389 } 414 390 391 + static inline struct xdp_desc_ctx 392 + xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) 393 + { 394 + return (struct xdp_desc_ctx){ }; 395 + } 396 + 415 
397 static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) 416 398 { 417 399 return false; 418 400 } 419 401 420 - static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) 402 + static inline struct xsk_tx_metadata * 403 + __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) 404 + { 405 + return NULL; 406 + } 407 + 408 + static inline struct xsk_tx_metadata * 409 + xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) 421 410 { 422 411 return NULL; 423 412 }
+8
include/net/xsk_buff_pool.h
··· 141 141 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); 142 142 void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); 143 143 dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); 144 + 145 + struct xdp_desc_ctx { 146 + dma_addr_t dma; 147 + struct xsk_tx_metadata *meta; 148 + }; 149 + 150 + struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); 151 + 144 152 static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) 145 153 { 146 154 return xskb->dma;
+42 -4
net/xdp/xsk_buff_pool.c
··· 699 699 } 700 700 EXPORT_SYMBOL(xp_free); 701 701 702 + static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) 703 + { 704 + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; 705 + } 706 + 707 + static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) 708 + { 709 + return pool->addrs + addr; 710 + } 711 + 702 712 void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) 703 713 { 704 - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; 705 - return pool->addrs + addr; 714 + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); 706 715 } 707 716 EXPORT_SYMBOL(xp_raw_get_data); 708 717 709 - dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) 718 + static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) 710 719 { 711 - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; 712 720 return (pool->dma_pages[addr >> PAGE_SHIFT] & 713 721 ~XSK_NEXT_PG_CONTIG_MASK) + 714 722 (addr & ~PAGE_MASK); 715 723 } 724 + 725 + dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) 726 + { 727 + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); 728 + } 716 729 EXPORT_SYMBOL(xp_raw_get_dma); 730 + 731 + /** 732 + * xp_raw_get_ctx - get &xdp_desc context 733 + * @pool: XSk buff pool desc address belongs to 734 + * @addr: desc address (from userspace) 735 + * 736 + * Helper for getting desc's DMA address and metadata pointer, if present. 737 + * Saves one call on hotpath, double calculation of the actual address, 738 + * and inline checks for metadata presence and sanity. 739 + * 740 + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata 741 + * pointer, if it is present and valid (initialized to %NULL otherwise). 
742 + */ 743 + struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) 744 + { 745 + struct xdp_desc_ctx ret; 746 + 747 + addr = __xp_raw_get_addr(pool, addr); 748 + 749 + ret.dma = __xp_raw_get_dma(pool, addr); 750 + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); 751 + 752 + return ret; 753 + } 754 + EXPORT_SYMBOL(xp_raw_get_ctx);