Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libeth: add Rx buffer management

Add a couple intuitive helpers to hide Rx buffer implementation details
in the library and not duplicate it across drivers. The settings are
sorta optimized for 100G+ NICs, but nothing really HW-specific here.
Use the new page_pool_dev_alloc() to dynamically switch between
split-page and full-page modes depending on MTU, page size, required
headroom etc. For example, on x86_64 with the default driver settings
each page is shared between 2 buffers. Turning on XDP (not in this
series) -> increasing headroom requirement pushes truesize out of 2048
boundary, leading to that each buffer starts getting a full page.
The "ceiling" limit is %PAGE_SIZE, as only order-0 pages are used to
avoid compound overhead. For the above architecture, this means maximum
linear frame size of 3712 w/o XDP.
Note that &libeth_fq is not a complete queue/ring structure for
now, rather a shim, but eventually the libeth-enabled drivers will move
to it, with iavf being the first one.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>

authored by

Alexander Lobakin and committed by
Tony Nguyen
e6c91556 ce230f4f

+216
+1
drivers/net/ethernet/intel/libeth/Kconfig
··· 3 3 4 4 config LIBETH 5 5 tristate 6 + select PAGE_POOL 6 7 help 7 8 libeth is a common library containing routines shared between several 8 9 drivers, but not yet promoted to the generic kernel API.
+98
drivers/net/ethernet/intel/libeth/rx.c
··· 3 3 4 4 #include <net/libeth/rx.h> 5 5 6 + /* Rx buffer management */ 7 + 8 + /** 9 + * libeth_rx_hw_len - get the actual buffer size to be passed to HW 10 + * @pp: &page_pool_params of the netdev to calculate the size for 11 + * @max_len: maximum buffer size for a single descriptor 12 + * 13 + * Return: HW-writeable length per one buffer to pass it to the HW accounting: 14 + * MTU the @dev has, HW required alignment, minimum and maximum allowed values, 15 + * and system's page size. 16 + */ 17 + static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len) 18 + { 19 + u32 len; 20 + 21 + len = READ_ONCE(pp->netdev->mtu) + LIBETH_RX_LL_LEN; 22 + len = ALIGN(len, LIBETH_RX_BUF_STRIDE); 23 + len = min3(len, ALIGN_DOWN(max_len ? : U32_MAX, LIBETH_RX_BUF_STRIDE), 24 + pp->max_len); 25 + 26 + return len; 27 + } 28 + 29 + /** 30 + * libeth_rx_fq_create - create a PP with the default libeth settings 31 + * @fq: buffer queue struct to fill 32 + * @napi: &napi_struct covering this PP (no usage outside its poll loops) 33 + * 34 + * Return: %0 on success, -%errno on failure. 
35 + */ 36 + int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) 37 + { 38 + struct page_pool_params pp = { 39 + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, 40 + .order = LIBETH_RX_PAGE_ORDER, 41 + .pool_size = fq->count, 42 + .nid = fq->nid, 43 + .dev = napi->dev->dev.parent, 44 + .netdev = napi->dev, 45 + .napi = napi, 46 + .dma_dir = DMA_FROM_DEVICE, 47 + .offset = LIBETH_SKB_HEADROOM, 48 + }; 49 + struct libeth_fqe *fqes; 50 + struct page_pool *pool; 51 + 52 + /* HW-writeable / syncable length per one page */ 53 + pp.max_len = LIBETH_RX_PAGE_LEN(pp.offset); 54 + 55 + /* HW-writeable length per buffer */ 56 + fq->buf_len = libeth_rx_hw_len(&pp, fq->buf_len); 57 + /* Buffer size to allocate */ 58 + fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp.offset + 59 + fq->buf_len)); 60 + 61 + pool = page_pool_create(&pp); 62 + if (IS_ERR(pool)) 63 + return PTR_ERR(pool); 64 + 65 + fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid); 66 + if (!fqes) 67 + goto err_buf; 68 + 69 + fq->fqes = fqes; 70 + fq->pp = pool; 71 + 72 + return 0; 73 + 74 + err_buf: 75 + page_pool_destroy(pool); 76 + 77 + return -ENOMEM; 78 + } 79 + EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH); 80 + 81 + /** 82 + * libeth_rx_fq_destroy - destroy a &page_pool created by libeth 83 + * @fq: buffer queue to process 84 + */ 85 + void libeth_rx_fq_destroy(struct libeth_fq *fq) 86 + { 87 + kvfree(fq->fqes); 88 + page_pool_destroy(fq->pp); 89 + } 90 + EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH); 91 + 92 + /** 93 + * libeth_rx_recycle_slow - recycle a libeth page from the NAPI context 94 + * @page: page to recycle 95 + * 96 + * To be used on exceptions or rare cases not requiring fast inline recycling. 
97 + */ 98 + void libeth_rx_recycle_slow(struct page *page) 99 + { 100 + page_pool_recycle_direct(page->pp, page); 101 + } 102 + EXPORT_SYMBOL_NS_GPL(libeth_rx_recycle_slow, LIBETH); 103 + 6 104 /* Converting abstract packet type numbers into a software structure with 7 105 * the packet parameters to do O(1) lookup on Rx. 8 106 */
+117
include/net/libeth/rx.h
··· 4 4 #ifndef __LIBETH_RX_H 5 5 #define __LIBETH_RX_H 6 6 7 + #include <linux/if_vlan.h> 8 + 9 + #include <net/page_pool/helpers.h> 7 10 #include <net/xdp.h> 11 + 12 + /* Rx buffer management */ 13 + 14 + /* Space reserved in front of each frame */ 15 + #define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) 16 + /* Maximum headroom for worst-case calculations */ 17 + #define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM 18 + /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ 19 + #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) 20 + 21 + /* Always use order-0 pages */ 22 + #define LIBETH_RX_PAGE_ORDER 0 23 + /* Pick a sane buffer stride and align to a cacheline boundary */ 24 + #define LIBETH_RX_BUF_STRIDE SKB_DATA_ALIGN(128) 25 + /* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */ 26 + #define LIBETH_RX_PAGE_LEN(hr) \ 27 + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER), \ 28 + LIBETH_RX_BUF_STRIDE) 29 + 30 + /** 31 + * struct libeth_fqe - structure representing an Rx buffer (fill queue element) 32 + * @page: page holding the buffer 33 + * @offset: offset from the page start (to the headroom) 34 + * @truesize: total space occupied by the buffer (w/ headroom and tailroom) 35 + * 36 + * Depending on the MTU, API switches between one-page-per-frame and shared 37 + * page model (to conserve memory on bigger-page platforms). In case of the 38 + * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. 
39 + */ 40 + struct libeth_fqe { 41 + struct page *page; 42 + u32 offset; 43 + u32 truesize; 44 + } __aligned_largest; 45 + 46 + /** 47 + * struct libeth_fq - structure representing a buffer (fill) queue 48 + * @fp: hotpath part of the structure 49 + * @pp: &page_pool for buffer management 50 + * @fqes: array of Rx buffers 51 + * @truesize: size to allocate per buffer, w/overhead 52 + * @count: number of descriptors/buffers the queue has 53 + * @buf_len: HW-writeable length per each buffer 54 + * @nid: ID of the closest NUMA node with memory 55 + */ 56 + struct libeth_fq { 57 + struct_group_tagged(libeth_fq_fp, fp, 58 + struct page_pool *pp; 59 + struct libeth_fqe *fqes; 60 + 61 + u32 truesize; 62 + u32 count; 63 + ); 64 + 65 + /* Cold fields */ 66 + u32 buf_len; 67 + int nid; 68 + }; 69 + 70 + int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi); 71 + void libeth_rx_fq_destroy(struct libeth_fq *fq); 72 + 73 + /** 74 + * libeth_rx_alloc - allocate a new Rx buffer 75 + * @fq: fill queue to allocate for 76 + * @i: index of the buffer within the queue 77 + * 78 + * Return: DMA address to be passed to HW for Rx on successful allocation, 79 + * ```DMA_MAPPING_ERROR``` otherwise. 80 + */ 81 + static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) 82 + { 83 + struct libeth_fqe *buf = &fq->fqes[i]; 84 + 85 + buf->truesize = fq->truesize; 86 + buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); 87 + if (unlikely(!buf->page)) 88 + return DMA_MAPPING_ERROR; 89 + 90 + return page_pool_get_dma_addr(buf->page) + buf->offset + 91 + fq->pp->p.offset; 92 + } 93 + 94 + void libeth_rx_recycle_slow(struct page *page); 95 + 96 + /** 97 + * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA 98 + * @fqe: buffer to process 99 + * @len: frame length from the descriptor 100 + * 101 + * Process the buffer after it's written by HW. 
The regular path is to 102 + * synchronize DMA for CPU, but in case of no data it will be immediately 103 + * recycled back to its PP. 104 + * 105 + * Return: true when there's data to process, false otherwise. 106 + */ 107 + static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, 108 + u32 len) 109 + { 110 + struct page *page = fqe->page; 111 + 112 + /* Very rare, but possible case. The most common reason: 113 + * the last fragment contained FCS only, which was then 114 + * stripped by the HW. 115 + */ 116 + if (unlikely(!len)) { 117 + libeth_rx_recycle_slow(page); 118 + return false; 119 + } 120 + 121 + page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); 122 + 123 + return true; 124 + } 8 125 9 126 /* Converting abstract packet type numbers into a software structure with 10 127 * the packet parameters to do O(1) lookup on Rx.