Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'add-frag-page-support-in-page-pool'

Yunsheng Lin says:

====================
add frag page support in page pool

This patchset adds frag page support in page pool and
enables skb's page frag recycling based on page pool in
the hns3 driver.
====================

Link: https://lore.kernel.org/r/1628217982-53533-1-git-send-email-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
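For context, here is a minimal sketch of the driver-side flow this series enables, loosely mirroring hns3_alloc_page_pool() and hns3_alloc_buffer() in the hunks below. The my_* structures and helpers are hypothetical stand-ins; only the page_pool_* and skb_* calls come from this series or the existing page pool API:

```c
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/skbuff.h>
#include <net/page_pool.h>

struct my_rx_buf {			/* hypothetical per-descriptor state */
	struct page	*page;
	unsigned int	offset;
	dma_addr_t	dma;
};

static struct page_pool *my_create_pool(struct device *dev, unsigned int nr_desc)
{
	struct page_pool_params pp = {
		/* PP_FLAG_PAGE_FRAG is the new flag added by this series */
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
		.order		= 0,
		.pool_size	= nr_desc,
		.nid		= dev_to_node(dev),
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp);	/* check with IS_ERR() */
}

static int my_alloc_rx_buf(struct page_pool *pool, struct my_rx_buf *buf,
			   unsigned int size)
{
	/* Hand out a sub-page fragment; the pool counts users per page */
	buf->page = page_pool_dev_alloc_frag(pool, &buf->offset, size);
	if (unlikely(!buf->page))
		return -ENOMEM;

	/* With PP_FLAG_DMA_MAP the pool has already mapped the whole page */
	buf->dma = page_pool_get_dma_addr(buf->page) + buf->offset;
	return 0;
}

static void my_rx_to_skb(struct sk_buff *skb, struct my_rx_buf *buf,
			 unsigned int len, unsigned int truesize)
{
	/* Let kfree_skb()/napi_consume_skb() return the frag to the pool */
	skb_mark_for_recycle(skb);
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, buf->page,
			buf->offset, len, truesize);
}
```

Fragments share one DMA mapping per page, which is why the per-fragment user count lives in struct page (see the mm_types.h hunk) rather than in the pool itself.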

+258 -39
+1
drivers/net/ethernet/hisilicon/Kconfig
··· 91 91 tristate "Hisilicon Network Subsystem Support HNS3 (Framework)" 92 92 depends on PCI 93 93 select NET_DEVLINK 94 + select PAGE_POOL 94 95 help 95 96 This selects the framework support for Hisilicon Network Subsystem 3. 96 97 This layer facilitates clients like ENET, RoCE and user-space ethernet
+74 -5
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
··· 3205 3205 unsigned int order = hns3_page_order(ring); 3206 3206 struct page *p; 3207 3207 3208 + if (ring->page_pool) { 3209 + p = page_pool_dev_alloc_frag(ring->page_pool, 3210 + &cb->page_offset, 3211 + hns3_buf_size(ring)); 3212 + if (unlikely(!p)) 3213 + return -ENOMEM; 3214 + 3215 + cb->priv = p; 3216 + cb->buf = page_address(p); 3217 + cb->dma = page_pool_get_dma_addr(p); 3218 + cb->type = DESC_TYPE_PP_FRAG; 3219 + cb->reuse_flag = 0; 3220 + return 0; 3221 + } 3222 + 3208 3223 p = dev_alloc_pages(order); 3209 3224 if (!p) 3210 3225 return -ENOMEM; ··· 3242 3227 if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | 3243 3228 DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) 3244 3229 napi_consume_skb(cb->priv, budget); 3245 - else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) 3246 - __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); 3230 + else if (!HNAE3_IS_TX_RING(ring)) { 3231 + if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) 3232 + __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); 3233 + else if (cb->type & DESC_TYPE_PP_FRAG) 3234 + page_pool_put_full_page(ring->page_pool, cb->priv, 3235 + false); 3236 + } 3247 3237 memset(cb, 0, sizeof(*cb)); 3248 3238 } 3249 3239 ··· 3335 3315 int ret; 3336 3316 3337 3317 ret = hns3_alloc_buffer(ring, cb); 3338 - if (ret) 3318 + if (ret || ring->page_pool) 3339 3319 goto out; 3340 3320 3341 3321 ret = hns3_map_buffer(ring, cb); ··· 3357 3337 if (ret) 3358 3338 return ret; 3359 3339 3360 - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); 3340 + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + 3341 + ring->desc_cb[i].page_offset); 3361 3342 3362 3343 return 0; 3363 3344 } ··· 3388 3367 { 3389 3368 hns3_unmap_buffer(ring, &ring->desc_cb[i]); 3390 3369 ring->desc_cb[i] = *res_cb; 3391 - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); 3370 + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + 3371 + ring->desc_cb[i].page_offset); 3392 3372 ring->desc[i].rx.bd_base_info = 0; 3393 3373 } 3394 3374 ··· 3560 3538 u32 truesize = hns3_buf_size(ring); 3561 3539 u32 frag_size = size - pull_len; 3562 3540 bool reused; 3541 + 3542 + if (ring->page_pool) { 3543 + skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, 3544 + frag_size, truesize); 3545 + return; 3546 + } 3563 3547 3564 3548 /* Avoid re-using remote or pfmem page */ 3565 3549 if (unlikely(!dev_page_is_reusable(desc_cb->priv))) ··· 3884 3856 /* We can reuse buffer as-is, just make sure it is reusable */ 3885 3857 if (dev_page_is_reusable(desc_cb->priv)) 3886 3858 desc_cb->reuse_flag = 1; 3859 + else if (desc_cb->type & DESC_TYPE_PP_FRAG) 3860 + page_pool_put_full_page(ring->page_pool, desc_cb->priv, 3861 + false); 3887 3862 else /* This page cannot be reused so discard it */ 3888 3863 __page_frag_cache_drain(desc_cb->priv, 3889 3864 desc_cb->pagecnt_bias); ··· 3894 3863 hns3_rx_ring_move_fw(ring); 3895 3864 return 0; 3896 3865 } 3866 + 3867 + if (ring->page_pool) 3868 + skb_mark_for_recycle(skb); 3869 + 3897 3870 u64_stats_update_begin(&ring->syncp); 3898 3871 ring->stats.seg_pkt_cnt++; 3899 3872 u64_stats_update_end(&ring->syncp); ··· 3936 3901 "alloc rx fraglist skb fail\n"); 3937 3902 return -ENXIO; 3938 3903 } 3904 + 3905 + if (ring->page_pool) 3906 + skb_mark_for_recycle(new_skb); 3907 + 3939 3908 ring->frag_num = 0; 3940 3909 3941 3910 if (ring->tail_skb) { ··· 4744 4705 priv->ring = NULL; 4745 4706 } 4746 4707 4708 + static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) 4709 + { 4710 + struct page_pool_params pp_params = { 4711 + .flags = 
PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | 4712 + PP_FLAG_DMA_SYNC_DEV, 4713 + .order = hns3_page_order(ring), 4714 + .pool_size = ring->desc_num * hns3_buf_size(ring) / 4715 + (PAGE_SIZE << hns3_page_order(ring)), 4716 + .nid = dev_to_node(ring_to_dev(ring)), 4717 + .dev = ring_to_dev(ring), 4718 + .dma_dir = DMA_FROM_DEVICE, 4719 + .offset = 0, 4720 + .max_len = PAGE_SIZE << hns3_page_order(ring), 4721 + }; 4722 + 4723 + ring->page_pool = page_pool_create(&pp_params); 4724 + if (IS_ERR(ring->page_pool)) { 4725 + dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", 4726 + PTR_ERR(ring->page_pool)); 4727 + ring->page_pool = NULL; 4728 + } 4729 + } 4730 + 4747 4731 static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) 4748 4732 { 4749 4733 int ret; ··· 4786 4724 goto out_with_desc_cb; 4787 4725 4788 4726 if (!HNAE3_IS_TX_RING(ring)) { 4727 + hns3_alloc_page_pool(ring); 4728 + 4789 4729 ret = hns3_alloc_ring_buffers(ring); 4790 4730 if (ret) 4791 4731 goto out_with_desc; ··· 4827 4763 get_order(tx_spare->len)); 4828 4764 devm_kfree(ring_to_dev(ring), tx_spare); 4829 4765 ring->tx_spare = NULL; 4766 + } 4767 + 4768 + if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { 4769 + page_pool_destroy(ring->page_pool); 4770 + ring->page_pool = NULL; 4830 4771 } 4831 4772 } 4832 4773
+3
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
··· 6 6 7 7 #include <linux/dim.h> 8 8 #include <linux/if_vlan.h> 9 + #include <net/page_pool.h> 9 10 10 11 #include "hnae3.h" 11 12 ··· 308 307 DESC_TYPE_BOUNCE_ALL = 1 << 3, 309 308 DESC_TYPE_BOUNCE_HEAD = 1 << 4, 310 309 DESC_TYPE_SGL_SKB = 1 << 5, 310 + DESC_TYPE_PP_FRAG = 1 << 6, 311 311 }; 312 312 313 313 struct hns3_desc_cb { ··· 453 451 struct hnae3_queue *tqp; 454 452 int queue_index; 455 453 struct device *dev; /* will be used for DMA mapping of descriptors */ 454 + struct page_pool *page_pool; 456 455 457 456 /* statistic */ 458 457 struct ring_stats stats;
+1 -5
drivers/net/ethernet/marvell/mvneta.c
··· 2327 2327 if (!skb) 2328 2328 return ERR_PTR(-ENOMEM); 2329 2329 2330 - skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool); 2330 + skb_mark_for_recycle(skb); 2331 2331 2332 2332 skb_reserve(skb, xdp->data - xdp->data_hard_start); 2333 2333 skb_put(skb, xdp->data_end - xdp->data); ··· 2339 2339 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, 2340 2340 skb_frag_page(frag), skb_frag_off(frag), 2341 2341 skb_frag_size(frag), PAGE_SIZE); 2342 - /* We don't need to reset pp_recycle here. It's already set, so 2343 - * just mark fragments for recycling. 2344 - */ 2345 - page_pool_store_mem_info(skb_frag_page(frag), pool); 2346 2342 } 2347 2343 2348 2344 return skb;
+1 -1
drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
··· 3995 3995 } 3996 3996 3997 3997 if (pp) 3998 - skb_mark_for_recycle(skb, page, pp); 3998 + skb_mark_for_recycle(skb); 3999 3999 else 4000 4000 dma_unmap_single_attrs(dev->dev.parent, dma_addr, 4001 4001 bm_pool->buf_size, DMA_FROM_DEVICE,
+1 -1
drivers/net/ethernet/ti/cpsw.c
··· 431 431 skb->protocol = eth_type_trans(skb, ndev); 432 432 433 433 /* mark skb for recycling */ 434 - skb_mark_for_recycle(skb, page, pool); 434 + skb_mark_for_recycle(skb); 435 435 netif_receive_skb(skb); 436 436 437 437 ndev->stats.rx_bytes += len;
+1 -1
drivers/net/ethernet/ti/cpsw_new.c
··· 375 375 skb->protocol = eth_type_trans(skb, ndev); 376 376 377 377 /* mark skb for recycling */ 378 - skb_mark_for_recycle(skb, page, pool); 378 + skb_mark_for_recycle(skb); 379 379 netif_receive_skb(skb); 380 380 381 381 ndev->stats.rx_bytes += len;
+13 -5
include/linux/mm_types.h
··· 103 103 unsigned long pp_magic; 104 104 struct page_pool *pp; 105 105 unsigned long _pp_mapping_pad; 106 - /** 107 - * @dma_addr: might require a 64-bit value on 108 - * 32-bit architectures. 109 - */ 110 - unsigned long dma_addr[2]; 106 + unsigned long dma_addr; 107 + union { 108 + /** 109 + * dma_addr_upper: might require a 64-bit 110 + * value on 32-bit architectures. 111 + */ 112 + unsigned long dma_addr_upper; 113 + /** 114 + * For frag page support, not supported in 115 + * 32-bit architectures with 64-bit DMA. 116 + */ 117 + atomic_long_t pp_frag_count; 118 + }; 111 119 }; 112 120 struct { /* slab, slob and slub */ 113 121 union {
+1 -3
include/linux/skbuff.h
··· 4712 4712 } 4713 4713 4714 4714 #ifdef CONFIG_PAGE_POOL 4715 - static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, 4716 - struct page_pool *pp) 4715 + static inline void skb_mark_for_recycle(struct sk_buff *skb) 4717 4716 { 4718 4717 skb->pp_recycle = 1; 4719 - page_pool_store_mem_info(page, pp); 4720 4718 } 4721 4719 #endif 4722 4720
+54 -14
include/net/page_pool.h
··· 45 45 * Please note DMA-sync-for-CPU is still 46 46 * device driver responsibility 47 47 */ 48 - #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) 48 + #define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ 49 + #define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ 50 + PP_FLAG_DMA_SYNC_DEV |\ 51 + PP_FLAG_PAGE_FRAG) 49 52 50 53 /* 51 54 * Fast allocation side cache array/stack ··· 91 88 unsigned long defer_warn; 92 89 93 90 u32 pages_state_hold_cnt; 91 + unsigned int frag_offset; 92 + struct page *frag_page; 93 + long frag_users; 94 94 95 95 /* 96 96 * Data structure for allocation side ··· 141 135 gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); 142 136 143 137 return page_pool_alloc_pages(pool, gfp); 138 + } 139 + 140 + struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, 141 + unsigned int size, gfp_t gfp); 142 + 143 + static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, 144 + unsigned int *offset, 145 + unsigned int size) 146 + { 147 + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); 148 + 149 + return page_pool_alloc_frag(pool, offset, size, gfp); 144 150 } 145 151 146 152 /* get the stored dma direction. A driver might decide to treat this locally and ··· 216 198 page_pool_put_full_page(pool, page, true); 217 199 } 218 200 201 + #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ 202 + (sizeof(dma_addr_t) > sizeof(unsigned long)) 203 + 219 204 static inline dma_addr_t page_pool_get_dma_addr(struct page *page) 220 205 { 221 - dma_addr_t ret = page->dma_addr[0]; 222 - if (sizeof(dma_addr_t) > sizeof(unsigned long)) 223 - ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; 206 + dma_addr_t ret = page->dma_addr; 207 + 208 + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) 209 + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; 210 + 224 211 return ret; 225 212 } 226 213 227 214 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 228 215 { 229 - page->dma_addr[0] = addr; 230 - if (sizeof(dma_addr_t) > sizeof(unsigned long)) 231 - page->dma_addr[1] = upper_32_bits(addr); 216 + page->dma_addr = addr; 217 + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) 218 + page->dma_addr_upper = upper_32_bits(addr); 219 + } 220 + 221 + static inline void page_pool_set_frag_count(struct page *page, long nr) 222 + { 223 + atomic_long_set(&page->pp_frag_count, nr); 224 + } 225 + 226 + static inline long page_pool_atomic_sub_frag_count_return(struct page *page, 227 + long nr) 228 + { 229 + long ret; 230 + 231 + /* As suggested by Alexander, atomic_long_read() may cover up the 232 + * reference count errors, so avoid calling atomic_long_read() in 233 + * the cases of freeing or draining the page_frags, where we would 234 + * not expect it to match or that are slowpath anyway. 235 + */ 236 + if (__builtin_constant_p(nr) && 237 + atomic_long_read(&page->pp_frag_count) == nr) 238 + return 0; 239 + 240 + ret = atomic_long_sub_return(nr, &page->pp_frag_count); 241 + WARN_ON(ret < 0); 242 + return ret; 232 243 } 233 244 234 245 static inline bool is_page_pool_compiled_in(void) ··· 298 251 spin_unlock(&pool->ring.producer_lock); 299 252 else 300 253 spin_unlock_bh(&pool->ring.producer_lock); 301 - } 302 - 303 - /* Store mem_info on struct page and use it while recycling skb frags */ 304 - static inline 305 - void page_pool_store_mem_info(struct page *page, struct page_pool *pp) 306 - { 307 - page->pp = pp; 308 254 } 309 255 310 256 #endif /* _NET_PAGE_POOL_H */
+108 -4
net/core/page_pool.c
··· 24 24 #define DEFER_TIME (msecs_to_jiffies(1000)) 25 25 #define DEFER_WARN_INTERVAL (60 * HZ) 26 26 27 + #define BIAS_MAX LONG_MAX 28 + 27 29 static int page_pool_init(struct page_pool *pool, 28 30 const struct page_pool_params *params) 29 31 { ··· 68 66 * offset used by the DMA engine to start copying rx data 69 67 */ 70 68 } 69 + 70 + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && 71 + pool->p.flags & PP_FLAG_PAGE_FRAG) 72 + return -EINVAL; 71 73 72 74 if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) 73 75 return -ENOMEM; ··· 212 206 return true; 213 207 } 214 208 209 + static void page_pool_set_pp_info(struct page_pool *pool, 210 + struct page *page) 211 + { 212 + page->pp = pool; 213 + page->pp_magic |= PP_SIGNATURE; 214 + } 215 + 216 + static void page_pool_clear_pp_info(struct page *page) 217 + { 218 + page->pp_magic = 0; 219 + page->pp = NULL; 220 + } 221 + 215 222 static struct page *__page_pool_alloc_page_order(struct page_pool *pool, 216 223 gfp_t gfp) 217 224 { ··· 241 222 return NULL; 242 223 } 243 224 244 - page->pp_magic |= PP_SIGNATURE; 225 + page_pool_set_pp_info(pool, page); 245 226 246 227 /* Track how many pages are held 'in-flight' */ 247 228 pool->pages_state_hold_cnt++; ··· 285 266 put_page(page); 286 267 continue; 287 268 } 288 - page->pp_magic |= PP_SIGNATURE; 269 + 270 + page_pool_set_pp_info(pool, page); 289 271 pool->alloc.cache[pool->alloc.count++] = page; 290 272 /* Track how many pages are held 'in-flight' */ 291 273 pool->pages_state_hold_cnt++; ··· 365 345 DMA_ATTR_SKIP_CPU_SYNC); 366 346 page_pool_set_dma_addr(page, 0); 367 347 skip_dma_unmap: 368 - page->pp_magic = 0; 348 + page_pool_clear_pp_info(page); 369 349 370 350 /* This may be the last page returned, releasing the pool, so 371 351 * it is not safe to reference pool afterwards. ··· 425 405 __page_pool_put_page(struct page_pool *pool, struct page *page, 426 406 unsigned int dma_sync_size, bool allow_direct) 427 407 { 408 + /* It is not the last user for the page frag case */ 409 + if (pool->p.flags & PP_FLAG_PAGE_FRAG && 410 + page_pool_atomic_sub_frag_count_return(page, 1)) 411 + return NULL; 412 + 428 413 /* This allocator is optimized for the XDP mode that uses 429 414 * one-frame-per-page, but have fallbacks that act like the 430 415 * regular page allocator APIs. 
··· 521 496 page_pool_return_page(pool, data[i]); 522 497 } 523 498 EXPORT_SYMBOL(page_pool_put_page_bulk); 499 + 500 + static struct page *page_pool_drain_frag(struct page_pool *pool, 501 + struct page *page) 502 + { 503 + long drain_count = BIAS_MAX - pool->frag_users; 504 + 505 + /* Some user is still using the page frag */ 506 + if (likely(page_pool_atomic_sub_frag_count_return(page, 507 + drain_count))) 508 + return NULL; 509 + 510 + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { 511 + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) 512 + page_pool_dma_sync_for_device(pool, page, -1); 513 + 514 + return page; 515 + } 516 + 517 + page_pool_return_page(pool, page); 518 + return NULL; 519 + } 520 + 521 + static void page_pool_free_frag(struct page_pool *pool) 522 + { 523 + long drain_count = BIAS_MAX - pool->frag_users; 524 + struct page *page = pool->frag_page; 525 + 526 + pool->frag_page = NULL; 527 + 528 + if (!page || 529 + page_pool_atomic_sub_frag_count_return(page, drain_count)) 530 + return; 531 + 532 + page_pool_return_page(pool, page); 533 + } 534 + 535 + struct page *page_pool_alloc_frag(struct page_pool *pool, 536 + unsigned int *offset, 537 + unsigned int size, gfp_t gfp) 538 + { 539 + unsigned int max_size = PAGE_SIZE << pool->p.order; 540 + struct page *page = pool->frag_page; 541 + 542 + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || 543 + size > max_size)) 544 + return NULL; 545 + 546 + size = ALIGN(size, dma_get_cache_alignment()); 547 + *offset = pool->frag_offset; 548 + 549 + if (page && *offset + size > max_size) { 550 + page = page_pool_drain_frag(pool, page); 551 + if (page) 552 + goto frag_reset; 553 + } 554 + 555 + if (!page) { 556 + page = page_pool_alloc_pages(pool, gfp); 557 + if (unlikely(!page)) { 558 + pool->frag_page = NULL; 559 + return NULL; 560 + } 561 + 562 + pool->frag_page = page; 563 + 564 + frag_reset: 565 + pool->frag_users = 1; 566 + *offset = 0; 567 + pool->frag_offset = size; 568 + page_pool_set_frag_count(page, BIAS_MAX); 569 + return page; 570 + } 571 + 572 + pool->frag_users++; 573 + pool->frag_offset = *offset + size; 574 + return page; 575 + } 576 + EXPORT_SYMBOL(page_pool_alloc_frag); 524 577 525 578 static void page_pool_empty_ring(struct page_pool *pool) 526 579 { ··· 705 602 if (!page_pool_put(pool)) 706 603 return; 707 604 605 + page_pool_free_frag(pool); 606 + 708 607 if (!page_pool_release(pool)) 709 608 return; 710 609 ··· 749 644 * The page will be returned to the pool here regardless of the 750 645 * 'flipped' fragment being in use or not. 751 646 */ 752 - page->pp = NULL; 753 647 page_pool_put_full_page(pp, page, false); 754 648 755 649 return true;
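A note on the reference scheme used by the page_pool.c hunks above: instead of one atomic increment per fragment, page_pool_alloc_frag() stamps a fresh page with BIAS_MAX once, counts handed-out fragments locally in pool->frag_users, and the drain paths subtract BIAS_MAX - frag_users so that exactly one reference per outstanding fragment remains; each put then subtracts 1 and whichever caller reaches zero recycles or frees the page. A toy userspace model of that arithmetic (the helpers and main() here are illustrative only, not kernel code; only BIAS_MAX and the subtraction pattern mirror the patch):

```c
#include <assert.h>
#include <limits.h>
#include <stdatomic.h>

#define BIAS_MAX LONG_MAX			/* same value the patch uses */

static atomic_long pp_frag_count;		/* stands in for page->pp_frag_count */
static long frag_users;				/* pool-local count of handed-out frags */

static void frag_page_init(void)
{
	/* one store per page instead of one atomic inc per fragment */
	atomic_store(&pp_frag_count, BIAS_MAX);
	frag_users = 0;
}

/* models page_pool_atomic_sub_frag_count_return(): value after the sub */
static long frag_count_sub(long nr)
{
	long ret = atomic_fetch_sub(&pp_frag_count, nr) - nr;

	assert(ret >= 0);
	return ret;
}

int main(void)
{
	frag_page_init();
	frag_users += 3;			/* three fragments handed out */

	frag_count_sub(BIAS_MAX - frag_users);	/* pool drains the page: count -> 3 */
	frag_count_sub(1);			/* first fragment freed:  -> 2 */
	frag_count_sub(1);			/* second fragment freed: -> 1 */
	assert(frag_count_sub(1) == 0);		/* last put: page can be recycled */
	return 0;
}
```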