Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libeth: xdp, xsk: access adjacent u32s as u64 where applicable

On 64-bit systems, writing/reading one u64 is faster than two u32s even
when they are adjacent in a struct. The compilers won't guarantee
they will combine those; I observed both successful and unsuccessful
attempts with both GCC and Clang, and it's not easy to say what it
depends on.
There are a few places in libeth_xdp winning up to several percent from
combined access (both performance and object code size, especially
when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE.
Drivers are free to optimize HW-specific callbacks under the same
definition.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>

authored by

Alexander Lobakin and committed by
Tony Nguyen
80bae9df 3ced71a8

+31 -8
+26 -3
include/net/libeth/xdp.h
··· 475 475 ((const void *)(uintptr_t)(priv)); \ 476 476 }) 477 477 478 + /* 479 + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len 480 + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. 481 + */ 482 + #ifdef __LITTLE_ENDIAN 483 + #define __LIBETH_WORD_ACCESS 1 484 + #endif 485 + #ifdef __LIBETH_WORD_ACCESS 486 + #define __libeth_xdp_tx_len(flen, ...) \ 487 + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) 488 + #else 489 + #define __libeth_xdp_tx_len(flen, ...) \ 490 + .len = (flen), .flags = (__VA_ARGS__ + 0) 491 + #endif 492 + 478 493 /** 479 494 * libeth_xdp_tx_xmit_bulk - main XDP Tx function 480 495 * @bulk: array of frames to send ··· 885 870 886 871 bq->bulk[bq->count++] = (typeof(*bq->bulk)){ 887 872 .xdpf = xdpf, 888 - .len = xdpf->len, 889 - .flags = LIBETH_XDP_TX_FIRST, 873 + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), 890 874 }; 891 875 892 876 if (!xdp_frame_has_frags(xdpf)) ··· 916 902 917 903 bq->bulk[bq->count++] = (typeof(*bq->bulk)){ 918 904 .dma = dma, 919 - .len = skb_frag_size(frag), 905 + __libeth_xdp_tx_len(skb_frag_size(frag)), 920 906 }; 921 907 922 908 return true; ··· 1274 1260 * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer 1275 1261 * head with the Rx buffer data: data pointer, length, headroom, and 1276 1262 * truesize/tailroom. Zeroes the flags. 1263 + * Uses faster single u64 write instead of per-field access. 
1277 1264 */ 1278 1265 static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, 1279 1266 const struct libeth_fqe *fqe, ··· 1282 1267 { 1283 1268 const struct page *page = __netmem_to_page(fqe->netmem); 1284 1269 1270 + #ifdef __LIBETH_WORD_ACCESS 1271 + static_assert(offsetofend(typeof(xdp->base), flags) - 1272 + offsetof(typeof(xdp->base), frame_sz) == 1273 + sizeof(u64)); 1274 + 1275 + *(u64 *)&xdp->base.frame_sz = fqe->truesize; 1276 + #else 1285 1277 xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); 1278 + #endif 1286 1279 xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, 1287 1280 page->pp->p.offset, len, true); 1288 1281 }
+5 -5
include/net/libeth/xsk.h
··· 26 26 { 27 27 bq->bulk[bq->count++] = (typeof(*bq->bulk)){ 28 28 .xsk = xdp, 29 - .len = xdp->base.data_end - xdp->data, 30 - .flags = LIBETH_XDP_TX_FIRST, 29 + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, 30 + LIBETH_XDP_TX_FIRST), 31 31 }; 32 32 33 33 if (likely(!xdp_buff_has_frags(&xdp->base))) ··· 48 48 { 49 49 bq->bulk[bq->count++] = (typeof(*bq->bulk)){ 50 50 .xsk = frag, 51 - .len = frag->base.data_end - frag->data, 51 + __libeth_xdp_tx_len(frag->base.data_end - frag->data), 52 52 }; 53 53 } 54 54 ··· 199 199 ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); 200 200 desc = (typeof(desc)){ 201 201 .addr = ctx.dma, 202 - .len = xdesc->len, 202 + __libeth_xdp_tx_len(xdesc->len), 203 203 }; 204 204 205 205 BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); ··· 226 226 { 227 227 return (struct libeth_xdp_tx_desc){ 228 228 .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), 229 - .len = xdesc->len, 229 + __libeth_xdp_tx_len(xdesc->len), 230 230 }; 231 231 } 232 232