Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

page_pool: devmem support

Convert netmem to be a union of struct page and struct net_iov. Overload
the LSB of struct netmem* to indicate that it's a net_iov, otherwise
it's a page.

Currently these entries in struct page are rented by the page_pool and
used exclusively by the net stack:

struct {
unsigned long pp_magic;
struct page_pool *pp;
unsigned long _pp_mapping_pad;
unsigned long dma_addr;
atomic_long_t pp_ref_count;
};

Mirror these (and only these) entries into struct net_iov and implement
netmem helpers that can access these common fields regardless of
whether the underlying type is page or net_iov.

Implement checks for net_iov in netmem helpers which delegate to mm
APIs, to ensure net_iov are never passed to the mm stack.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-6-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Mina Almasry and committed by
Jakub Kicinski
8ab79ed5 28c5c74e

+218 -69
+117 -7
include/net/netmem.h
··· 8 8 #ifndef _NET_NETMEM_H 9 9 #define _NET_NETMEM_H 10 10 11 + #include <linux/mm.h> 12 + #include <net/net_debug.h> 13 + 11 14 /* net_iov */ 12 15 16 + DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); 17 + 18 + /* We overload the LSB of the struct page pointer to indicate whether it's 19 + * a page or net_iov. 20 + */ 21 + #define NET_IOV 0x01UL 22 + 13 23 struct net_iov { 24 + unsigned long __unused_padding; 25 + unsigned long pp_magic; 26 + struct page_pool *pp; 14 27 struct dmabuf_genpool_chunk_owner *owner; 28 + unsigned long dma_addr; 29 + atomic_long_t pp_ref_count; 15 30 }; 31 + 32 + /* These fields in struct page are used by the page_pool and net stack: 33 + * 34 + * struct { 35 + * unsigned long pp_magic; 36 + * struct page_pool *pp; 37 + * unsigned long _pp_mapping_pad; 38 + * unsigned long dma_addr; 39 + * atomic_long_t pp_ref_count; 40 + * }; 41 + * 42 + * We mirror the page_pool fields here so the page_pool can access these fields 43 + * without worrying whether the underlying fields belong to a page or net_iov. 44 + * 45 + * The non-net stack fields of struct page are private to the mm stack and must 46 + * never be mirrored to net_iov. 47 + */ 48 + #define NET_IOV_ASSERT_OFFSET(pg, iov) \ 49 + static_assert(offsetof(struct page, pg) == \ 50 + offsetof(struct net_iov, iov)) 51 + NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); 52 + NET_IOV_ASSERT_OFFSET(pp, pp); 53 + NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); 54 + NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); 55 + #undef NET_IOV_ASSERT_OFFSET 16 56 17 57 /* netmem */ 18 58 ··· 67 27 */ 68 28 typedef unsigned long __bitwise netmem_ref; 69 29 30 + static inline bool netmem_is_net_iov(const netmem_ref netmem) 31 + { 32 + return (__force unsigned long)netmem & NET_IOV; 33 + } 34 + 70 35 /* This conversion fails (returns NULL) if the netmem_ref is not struct page 71 36 * backed. 72 - * 73 - * Currently struct page is the only possible netmem, and this helper never 74 - * fails. 
75 37 */ 76 38 static inline struct page *netmem_to_page(netmem_ref netmem) 77 39 { 40 + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) 41 + return NULL; 42 + 78 43 return (__force struct page *)netmem; 79 44 } 80 45 81 - /* Converting from page to netmem is always safe, because a page can always be 82 - * a netmem. 83 - */ 46 + static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) 47 + { 48 + if (netmem_is_net_iov(netmem)) 49 + return (struct net_iov *)((__force unsigned long)netmem & 50 + ~NET_IOV); 51 + 52 + DEBUG_NET_WARN_ON_ONCE(true); 53 + return NULL; 54 + } 55 + 56 + static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) 57 + { 58 + return (__force netmem_ref)((unsigned long)niov | NET_IOV); 59 + } 60 + 84 61 static inline netmem_ref page_to_netmem(struct page *page) 85 62 { 86 63 return (__force netmem_ref)page; ··· 105 48 106 49 static inline int netmem_ref_count(netmem_ref netmem) 107 50 { 51 + /* The non-pp refcount of net_iov is always 1. On net_iov, we only 52 + * support pp refcounting which uses the pp_ref_count field. 
53 + */ 54 + if (netmem_is_net_iov(netmem)) 55 + return 1; 56 + 108 57 return page_ref_count(netmem_to_page(netmem)); 109 58 } 110 59 111 - static inline unsigned long netmem_to_pfn(netmem_ref netmem) 60 + static inline unsigned long netmem_pfn_trace(netmem_ref netmem) 112 61 { 62 + if (netmem_is_net_iov(netmem)) 63 + return 0; 64 + 113 65 return page_to_pfn(netmem_to_page(netmem)); 66 + } 67 + 68 + static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) 69 + { 70 + return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); 71 + } 72 + 73 + static inline struct page_pool *netmem_get_pp(netmem_ref netmem) 74 + { 75 + return __netmem_clear_lsb(netmem)->pp; 76 + } 77 + 78 + static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) 79 + { 80 + return &__netmem_clear_lsb(netmem)->pp_ref_count; 81 + } 82 + 83 + static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) 84 + { 85 + /* NUMA node preference only makes sense if we're allocating 86 + * system memory. Memory providers (which give us net_iovs) 87 + * choose for us. 88 + */ 89 + if (netmem_is_net_iov(netmem)) 90 + return true; 91 + 92 + return page_to_nid(netmem_to_page(netmem)) == pref_nid; 114 93 } 115 94 116 95 static inline netmem_ref netmem_compound_head(netmem_ref netmem) 117 96 { 97 + /* niov are never compounded */ 98 + if (netmem_is_net_iov(netmem)) 99 + return netmem; 100 + 118 101 return page_to_netmem(compound_head(netmem_to_page(netmem))); 102 + } 103 + 104 + static inline void *netmem_address(netmem_ref netmem) 105 + { 106 + if (netmem_is_net_iov(netmem)) 107 + return NULL; 108 + 109 + return page_address(netmem_to_page(netmem)); 110 + } 111 + 112 + static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) 113 + { 114 + return __netmem_clear_lsb(netmem)->dma_addr; 119 115 } 120 116 121 117 #endif /* _NET_NETMEM_H */
+7 -32
include/net/page_pool/helpers.h
··· 216 216 217 217 static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) 218 218 { 219 - atomic_long_set(&netmem_to_page(netmem)->pp_ref_count, nr); 219 + atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); 220 220 } 221 221 222 222 /** ··· 244 244 245 245 static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) 246 246 { 247 - struct page *page = netmem_to_page(netmem); 247 + atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); 248 248 long ret; 249 249 250 250 /* If nr == pp_ref_count then we have cleared all remaining ··· 261 261 * initially, and only overwrite it when the page is partitioned into 262 262 * more than one piece. 263 263 */ 264 - if (atomic_long_read(&page->pp_ref_count) == nr) { 264 + if (atomic_long_read(pp_ref_count) == nr) { 265 265 /* As we have ensured nr is always one for constant case using 266 266 * the BUILD_BUG_ON(), only need to handle the non-constant case 267 267 * here for pp_ref_count draining, which is a rare case. 268 268 */ 269 269 BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); 270 270 if (!__builtin_constant_p(nr)) 271 - atomic_long_set(&page->pp_ref_count, 1); 271 + atomic_long_set(pp_ref_count, 1); 272 272 273 273 return 0; 274 274 } 275 275 276 - ret = atomic_long_sub_return(nr, &page->pp_ref_count); 276 + ret = atomic_long_sub_return(nr, pp_ref_count); 277 277 WARN_ON(ret < 0); 278 278 279 279 /* We are the last user here too, reset pp_ref_count back to 1 to ··· 282 282 * page_pool_unref_page() currently. 
283 283 */ 284 284 if (unlikely(!ret)) 285 - atomic_long_set(&page->pp_ref_count, 1); 285 + atomic_long_set(pp_ref_count, 1); 286 286 287 287 return ret; 288 288 } ··· 401 401 402 402 static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) 403 403 { 404 - struct page *page = netmem_to_page(netmem); 405 - 406 - dma_addr_t ret = page->dma_addr; 404 + dma_addr_t ret = netmem_get_dma_addr(netmem); 407 405 408 406 if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) 409 407 ret <<= PAGE_SHIFT; ··· 419 421 static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) 420 422 { 421 423 return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); 422 - } 423 - 424 - static inline bool page_pool_set_dma_addr_netmem(netmem_ref netmem, 425 - dma_addr_t addr) 426 - { 427 - struct page *page = netmem_to_page(netmem); 428 - 429 - if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { 430 - page->dma_addr = addr >> PAGE_SHIFT; 431 - 432 - /* We assume page alignment to shave off bottom bits, 433 - * if this "compression" doesn't work we need to drop. 434 - */ 435 - return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT; 436 - } 437 - 438 - page->dma_addr = addr; 439 - return false; 440 424 } 441 425 442 426 /** ··· 441 461 page_pool_get_dma_addr(page), 442 462 offset + pool->p.offset, dma_sync_size, 443 463 page_pool_get_dma_dir(pool)); 444 - } 445 - 446 - static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 447 - { 448 - return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); 449 464 } 450 465 451 466 static inline bool page_pool_put(struct page_pool *pool)
+6 -6
include/trace/events/page_pool.h
··· 57 57 __entry->pool = pool; 58 58 __entry->netmem = (__force unsigned long)netmem; 59 59 __entry->release = release; 60 - __entry->pfn = netmem_to_pfn(netmem); 60 + __entry->pfn = netmem_pfn_trace(netmem); 61 61 ), 62 62 63 - TP_printk("page_pool=%p netmem=%p pfn=0x%lx release=%u", 63 + TP_printk("page_pool=%p netmem=%p is_net_iov=%lu pfn=0x%lx release=%u", 64 64 __entry->pool, (void *)__entry->netmem, 65 - __entry->pfn, __entry->release) 65 + __entry->netmem & NET_IOV, __entry->pfn, __entry->release) 66 66 ); 67 67 68 68 TRACE_EVENT(page_pool_state_hold, ··· 83 83 __entry->pool = pool; 84 84 __entry->netmem = (__force unsigned long)netmem; 85 85 __entry->hold = hold; 86 - __entry->pfn = netmem_to_pfn(netmem); 86 + __entry->pfn = netmem_pfn_trace(netmem); 87 87 ), 88 88 89 - TP_printk("page_pool=%p netmem=%p pfn=0x%lx hold=%u", 89 + TP_printk("page_pool=%p netmem=%p is_net_iov=%lu, pfn=0x%lx hold=%u", 90 90 __entry->pool, (void *)__entry->netmem, 91 - __entry->pfn, __entry->hold) 91 + __entry->netmem & NET_IOV, __entry->pfn, __entry->hold) 92 92 ); 93 93 94 94 TRACE_EVENT(page_pool_update_nid,
+7
net/core/devmem.c
··· 18 18 #include <trace/events/page_pool.h> 19 19 20 20 #include "devmem.h" 21 + #include "page_pool_priv.h" 21 22 22 23 /* Device memory support */ 23 24 ··· 82 81 offset = dma_addr - owner->base_dma_addr; 83 82 index = offset / PAGE_SIZE; 84 83 niov = &owner->niovs[index]; 84 + 85 + niov->pp_magic = 0; 86 + niov->pp = NULL; 87 + atomic_long_set(&niov->pp_ref_count, 0); 85 88 86 89 return niov; 87 90 } ··· 274 269 for (i = 0; i < owner->num_niovs; i++) { 275 270 niov = &owner->niovs[i]; 276 271 niov->owner = owner; 272 + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), 273 + net_devmem_get_dma_addr(niov)); 277 274 } 278 275 279 276 virtual += len;
+31
net/core/netmem_priv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef __NETMEM_PRIV_H 4 + #define __NETMEM_PRIV_H 5 + 6 + static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) 7 + { 8 + return __netmem_clear_lsb(netmem)->pp_magic; 9 + } 10 + 11 + static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) 12 + { 13 + __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; 14 + } 15 + 16 + static inline void netmem_clear_pp_magic(netmem_ref netmem) 17 + { 18 + __netmem_clear_lsb(netmem)->pp_magic = 0; 19 + } 20 + 21 + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) 22 + { 23 + __netmem_clear_lsb(netmem)->pp = pool; 24 + } 25 + 26 + static inline void netmem_set_dma_addr(netmem_ref netmem, 27 + unsigned long dma_addr) 28 + { 29 + __netmem_clear_lsb(netmem)->dma_addr = dma_addr; 30 + } 31 + #endif
+13 -12
net/core/page_pool.c
··· 24 24 25 25 #include <trace/events/page_pool.h> 26 26 27 + #include "netmem_priv.h" 27 28 #include "page_pool_priv.h" 29 + 30 + DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); 28 31 29 32 #define DEFER_TIME (msecs_to_jiffies(1000)) 30 33 #define DEFER_WARN_INTERVAL (60 * HZ) ··· 361 358 if (unlikely(!netmem)) 362 359 break; 363 360 364 - if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) { 361 + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { 365 362 pool->alloc.cache[pool->alloc.count++] = netmem; 366 363 } else { 367 364 /* NUMA mismatch; ··· 457 454 458 455 static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) 459 456 { 460 - struct page *page = netmem_to_page(netmem); 461 - 462 - page->pp = pool; 463 - page->pp_magic |= PP_SIGNATURE; 457 + netmem_set_pp(netmem, pool); 458 + netmem_or_pp_magic(netmem, PP_SIGNATURE); 464 459 465 460 /* Ensuring all pages have been split into one fragment initially: 466 461 * page_pool_set_pp_info() is only called once for every page when it ··· 473 472 474 473 static void page_pool_clear_pp_info(netmem_ref netmem) 475 474 { 476 - struct page *page = netmem_to_page(netmem); 477 - 478 - page->pp_magic = 0; 479 - page->pp = NULL; 475 + netmem_clear_pp_magic(netmem); 476 + netmem_set_pp(netmem, NULL); 480 477 } 481 478 482 479 static struct page *__page_pool_alloc_page_order(struct page_pool *pool, ··· 691 692 692 693 static bool __page_pool_page_can_be_recycled(netmem_ref netmem) 693 694 { 694 - return page_ref_count(netmem_to_page(netmem)) == 1 && 695 - !page_is_pfmemalloc(netmem_to_page(netmem)); 695 + return netmem_is_net_iov(netmem) || 696 + (page_ref_count(netmem_to_page(netmem)) == 1 && 697 + !page_is_pfmemalloc(netmem_to_page(netmem))); 696 698 } 697 699 698 700 /* If the page refcnt == 1, this will try to recycle the page. ··· 728 728 /* Page found as candidate for recycling */ 729 729 return netmem; 730 730 } 731 + 731 732 /* Fallback/non-XDP mode: API user have elevated refcnt. 
732 733 * 733 734 * Many drivers split up the page into fragments, and some ··· 950 949 /* Empty recycle ring */ 951 950 while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { 952 951 /* Verify the refcnt invariant of cached pages */ 953 - if (!(page_ref_count(netmem_to_page(netmem)) == 1)) 952 + if (!(netmem_ref_count(netmem) == 1)) 954 953 pr_crit("%s() page_pool refcnt %d violation\n", 955 954 __func__, netmem_ref_count(netmem)); 956 955
+26
net/core/page_pool_priv.h
··· 3 3 #ifndef __PAGE_POOL_PRIV_H 4 4 #define __PAGE_POOL_PRIV_H 5 5 6 + #include <net/page_pool/helpers.h> 7 + 8 + #include "netmem_priv.h" 9 + 6 10 s32 page_pool_inflight(const struct page_pool *pool, bool strict); 7 11 8 12 int page_pool_list(struct page_pool *pool); 9 13 void page_pool_detached(struct page_pool *pool); 10 14 void page_pool_unlist(struct page_pool *pool); 15 + 16 + static inline bool 17 + page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) 18 + { 19 + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { 20 + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); 21 + 22 + /* We assume page alignment to shave off bottom bits, 23 + * if this "compression" doesn't work we need to drop. 24 + */ 25 + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) 26 + << PAGE_SHIFT; 27 + } 28 + 29 + netmem_set_dma_addr(netmem, addr); 30 + return false; 31 + } 32 + 33 + static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) 34 + { 35 + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); 36 + } 11 37 12 38 #endif
+11 -12
net/core/skbuff.c
··· 88 88 #include <linux/textsearch.h> 89 89 90 90 #include "dev.h" 91 + #include "netmem_priv.h" 91 92 #include "sock_destructor.h" 92 93 93 94 #ifdef CONFIG_SKB_EXTENSIONS ··· 921 920 skb_get(list); 922 921 } 923 922 924 - static bool is_pp_page(struct page *page) 923 + static bool is_pp_netmem(netmem_ref netmem) 925 924 { 926 - return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; 925 + return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; 927 926 } 928 927 929 928 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, ··· 1021 1020 #if IS_ENABLED(CONFIG_PAGE_POOL) 1022 1021 bool napi_pp_put_page(netmem_ref netmem) 1023 1022 { 1024 - struct page *page = netmem_to_page(netmem); 1025 - 1026 - page = compound_head(page); 1023 + netmem = netmem_compound_head(netmem); 1027 1024 1028 1025 /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation 1029 1026 * in order to preserve any existing bits, such as bit 0 for the ··· 1030 1031 * and page_is_pfmemalloc() is checked in __page_pool_put_page() 1031 1032 * to avoid recycling the pfmemalloc page. 
1032 1033 */ 1033 - if (unlikely(!is_pp_page(page))) 1034 + if (unlikely(!is_pp_netmem(netmem))) 1034 1035 return false; 1035 1036 1036 - page_pool_put_full_netmem(page->pp, page_to_netmem(page), false); 1037 + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); 1037 1038 1038 1039 return true; 1039 1040 } ··· 1060 1061 static int skb_pp_frag_ref(struct sk_buff *skb) 1061 1062 { 1062 1063 struct skb_shared_info *shinfo; 1063 - struct page *head_page; 1064 + netmem_ref head_netmem; 1064 1065 int i; 1065 1066 1066 1067 if (!skb->pp_recycle) ··· 1069 1070 shinfo = skb_shinfo(skb); 1070 1071 1071 1072 for (i = 0; i < shinfo->nr_frags; i++) { 1072 - head_page = compound_head(skb_frag_page(&shinfo->frags[i])); 1073 - if (likely(is_pp_page(head_page))) 1074 - page_pool_ref_page(head_page); 1073 + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); 1074 + if (likely(is_pp_netmem(head_netmem))) 1075 + page_pool_ref_netmem(head_netmem); 1075 1076 else 1076 - page_ref_inc(head_page); 1077 + page_ref_inc(netmem_to_page(head_netmem)); 1077 1078 } 1078 1079 return 0; 1079 1080 }