Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'page_pool-allow-direct-bulk-recycling'

Alexander Lobakin says:

====================
page_pool: allow direct bulk recycling

Previously, there was no reliable way to check whether it's safe to use
direct PP cache. The drivers were passing @allow_direct to the PP
recycling functions and that was it. Bulk recycling is used by
xdp_return_frame_bulk() on .ndo_xdp_xmit() frames completion where
the page origin is unknown, thus the direct recycling has never been
tried.
Now that we have at least 2 ways of checking if we're allowed to perform
direct recycling -- pool->p.napi (Jakub) and pool->cpuid (Lorenzo), we
can use them when doing bulk recycling as well. Just move that logic
from the skb core to the PP core and call it before
__page_pool_put_page() every time @allow_direct is false.
Under high .ndo_xdp_xmit() traffic load, the win is 2-3% in PPS (packets
per second), assuming the sending driver uses xdp_return_frame_bulk() on
Tx completion.
====================

Link: https://lore.kernel.org/r/20240329165507.3240110-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+63 -61
+6 -6
include/linux/skbuff.h
··· 3510 3510 unsigned int headroom); 3511 3511 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, 3512 3512 struct bpf_prog *prog); 3513 - bool napi_pp_put_page(struct page *page, bool napi_safe); 3513 + bool napi_pp_put_page(struct page *page); 3514 3514 3515 3515 static inline void 3516 - skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) 3516 + skb_page_unref(const struct sk_buff *skb, struct page *page) 3517 3517 { 3518 3518 #ifdef CONFIG_PAGE_POOL 3519 - if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) 3519 + if (skb->pp_recycle && napi_pp_put_page(page)) 3520 3520 return; 3521 3521 #endif 3522 3522 put_page(page); 3523 3523 } 3524 3524 3525 3525 static inline void 3526 - napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) 3526 + napi_frag_unref(skb_frag_t *frag, bool recycle) 3527 3527 { 3528 3528 struct page *page = skb_frag_page(frag); 3529 3529 3530 3530 #ifdef CONFIG_PAGE_POOL 3531 - if (recycle && napi_pp_put_page(page, napi_safe)) 3531 + if (recycle && napi_pp_put_page(page)) 3532 3532 return; 3533 3533 #endif 3534 3534 put_page(page); ··· 3544 3544 */ 3545 3545 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) 3546 3546 { 3547 - napi_frag_unref(frag, recycle, false); 3547 + napi_frag_unref(frag, recycle); 3548 3548 } 3549 3549 3550 3550 /**
+33 -5
net/core/page_pool.c
··· 690 690 page_pool_dma_sync_for_device(pool, page, 691 691 dma_sync_size); 692 692 693 - if (allow_direct && in_softirq() && 694 - page_pool_recycle_in_cache(page, pool)) 693 + if (allow_direct && page_pool_recycle_in_cache(page, pool)) 695 694 return NULL; 696 695 697 696 /* Page found as candidate for recycling */ ··· 715 716 return NULL; 716 717 } 717 718 719 + static bool page_pool_napi_local(const struct page_pool *pool) 720 + { 721 + const struct napi_struct *napi; 722 + u32 cpuid; 723 + 724 + if (unlikely(!in_softirq())) 725 + return false; 726 + 727 + /* Allow direct recycle if we have reasons to believe that we are 728 + * in the same context as the consumer would run, so there's 729 + * no possible race. 730 + * __page_pool_put_page() makes sure we're not in hardirq context 731 + * and interrupts are enabled prior to accessing the cache. 732 + */ 733 + cpuid = smp_processor_id(); 734 + if (READ_ONCE(pool->cpuid) == cpuid) 735 + return true; 736 + 737 + napi = READ_ONCE(pool->p.napi); 738 + 739 + return napi && READ_ONCE(napi->list_owner) == cpuid; 740 + } 741 + 718 742 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, 719 743 unsigned int dma_sync_size, bool allow_direct) 720 744 { 745 + if (!allow_direct) 746 + allow_direct = page_pool_napi_local(pool); 747 + 721 748 page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); 722 749 if (page && !page_pool_recycle_in_ring(pool, page)) { 723 750 /* Cache full, fallback to free pages */ ··· 772 747 int count) 773 748 { 774 749 int i, bulk_len = 0; 750 + bool allow_direct; 775 751 bool in_softirq; 752 + 753 + allow_direct = page_pool_napi_local(pool); 776 754 777 755 for (i = 0; i < count; i++) { 778 756 struct page *page = virt_to_head_page(data[i]); ··· 784 756 if (!page_pool_is_last_ref(page)) 785 757 continue; 786 758 787 - page = __page_pool_put_page(pool, page, -1, false); 759 + page = __page_pool_put_page(pool, page, -1, allow_direct); 788 760 /* Approved for 
bulk recycling in ptr_ring cache */ 789 761 if (page) 790 762 data[bulk_len++] = page; 791 763 } 792 764 793 - if (unlikely(!bulk_len)) 765 + if (!bulk_len) 794 766 return; 795 767 796 768 /* Bulk producer into ptr_ring page_pool cache */ ··· 997 969 static void page_pool_disable_direct_recycling(struct page_pool *pool) 998 970 { 999 971 /* Disable direct recycling based on pool->cpuid. 1000 - * Paired with READ_ONCE() in napi_pp_put_page(). 972 + * Paired with READ_ONCE() in page_pool_napi_local(). 1001 973 */ 1002 974 WRITE_ONCE(pool->cpuid, -1); 1003 975
+22 -48
net/core/skbuff.c
··· 1004 1004 EXPORT_SYMBOL(skb_cow_data_for_xdp); 1005 1005 1006 1006 #if IS_ENABLED(CONFIG_PAGE_POOL) 1007 - bool napi_pp_put_page(struct page *page, bool napi_safe) 1007 + bool napi_pp_put_page(struct page *page) 1008 1008 { 1009 - bool allow_direct = false; 1010 - struct page_pool *pp; 1011 - 1012 1009 page = compound_head(page); 1013 1010 1014 1011 /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation ··· 1018 1021 if (unlikely(!is_pp_page(page))) 1019 1022 return false; 1020 1023 1021 - pp = page->pp; 1022 - 1023 - /* Allow direct recycle if we have reasons to believe that we are 1024 - * in the same context as the consumer would run, so there's 1025 - * no possible race. 1026 - * __page_pool_put_page() makes sure we're not in hardirq context 1027 - * and interrupts are enabled prior to accessing the cache. 1028 - */ 1029 - if (napi_safe || in_softirq()) { 1030 - const struct napi_struct *napi = READ_ONCE(pp->p.napi); 1031 - unsigned int cpuid = smp_processor_id(); 1032 - 1033 - allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; 1034 - allow_direct |= READ_ONCE(pp->cpuid) == cpuid; 1035 - } 1036 - 1037 - /* Driver set this to memory recycling info. Reset it on recycle. 1038 - * This will *not* work for NIC using a split-page memory model. 1039 - * The page will be returned to the pool here regardless of the 1040 - * 'flipped' fragment being in use or not. 
1041 - */ 1042 - page_pool_put_full_page(pp, page, allow_direct); 1024 + page_pool_put_full_page(page->pp, page, false); 1043 1025 1044 1026 return true; 1045 1027 } 1046 1028 EXPORT_SYMBOL(napi_pp_put_page); 1047 1029 #endif 1048 1030 1049 - static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) 1031 + static bool skb_pp_recycle(struct sk_buff *skb, void *data) 1050 1032 { 1051 1033 if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) 1052 1034 return false; 1053 - return napi_pp_put_page(virt_to_page(data), napi_safe); 1035 + return napi_pp_put_page(virt_to_page(data)); 1054 1036 } 1055 1037 1056 1038 /** ··· 1071 1095 kfree(head); 1072 1096 } 1073 1097 1074 - static void skb_free_head(struct sk_buff *skb, bool napi_safe) 1098 + static void skb_free_head(struct sk_buff *skb) 1075 1099 { 1076 1100 unsigned char *head = skb->head; 1077 1101 1078 1102 if (skb->head_frag) { 1079 - if (skb_pp_recycle(skb, head, napi_safe)) 1103 + if (skb_pp_recycle(skb, head)) 1080 1104 return; 1081 1105 skb_free_frag(head); 1082 1106 } else { ··· 1084 1108 } 1085 1109 } 1086 1110 1087 - static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, 1088 - bool napi_safe) 1111 + static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) 1089 1112 { 1090 1113 struct skb_shared_info *shinfo = skb_shinfo(skb); 1091 1114 int i; ··· 1101 1126 } 1102 1127 1103 1128 for (i = 0; i < shinfo->nr_frags; i++) 1104 - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); 1129 + napi_frag_unref(&shinfo->frags[i], skb->pp_recycle); 1105 1130 1106 1131 free_head: 1107 1132 if (shinfo->frag_list) 1108 1133 kfree_skb_list_reason(shinfo->frag_list, reason); 1109 1134 1110 - skb_free_head(skb, napi_safe); 1135 + skb_free_head(skb); 1111 1136 exit: 1112 1137 /* When we clone an SKB we copy the reycling bit. 
The pp_recycle 1113 1138 * bit is only set on the head though, so in order to avoid races ··· 1168 1193 } 1169 1194 1170 1195 /* Free everything but the sk_buff shell. */ 1171 - static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, 1172 - bool napi_safe) 1196 + static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) 1173 1197 { 1174 1198 skb_release_head_state(skb); 1175 1199 if (likely(skb->head)) 1176 - skb_release_data(skb, reason, napi_safe); 1200 + skb_release_data(skb, reason); 1177 1201 } 1178 1202 1179 1203 /** ··· 1186 1212 1187 1213 void __kfree_skb(struct sk_buff *skb) 1188 1214 { 1189 - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); 1215 + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); 1190 1216 kfree_skbmem(skb); 1191 1217 } 1192 1218 EXPORT_SYMBOL(__kfree_skb); ··· 1243 1269 return; 1244 1270 } 1245 1271 1246 - skb_release_all(skb, reason, false); 1272 + skb_release_all(skb, reason); 1247 1273 sa->skb_array[sa->skb_count++] = skb; 1248 1274 1249 1275 if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { ··· 1417 1443 void __consume_stateless_skb(struct sk_buff *skb) 1418 1444 { 1419 1445 trace_consume_skb(skb, __builtin_return_address(0)); 1420 - skb_release_data(skb, SKB_CONSUMED, false); 1446 + skb_release_data(skb, SKB_CONSUMED); 1421 1447 kfree_skbmem(skb); 1422 1448 } 1423 1449 ··· 1444 1470 1445 1471 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) 1446 1472 { 1447 - skb_release_all(skb, reason, true); 1473 + skb_release_all(skb, reason); 1448 1474 napi_skb_cache_put(skb); 1449 1475 } 1450 1476 ··· 1482 1508 return; 1483 1509 } 1484 1510 1485 - skb_release_all(skb, SKB_CONSUMED, !!budget); 1511 + skb_release_all(skb, SKB_CONSUMED); 1486 1512 napi_skb_cache_put(skb); 1487 1513 } 1488 1514 EXPORT_SYMBOL(napi_consume_skb); ··· 1613 1639 */ 1614 1640 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) 1615 1641 { 1616 - skb_release_all(dst, 
SKB_CONSUMED, false); 1642 + skb_release_all(dst, SKB_CONSUMED); 1617 1643 return __skb_clone(dst, src); 1618 1644 } 1619 1645 EXPORT_SYMBOL_GPL(skb_morph); ··· 2245 2271 if (skb_has_frag_list(skb)) 2246 2272 skb_clone_fraglist(skb); 2247 2273 2248 - skb_release_data(skb, SKB_CONSUMED, false); 2274 + skb_release_data(skb, SKB_CONSUMED); 2249 2275 } else { 2250 - skb_free_head(skb, false); 2276 + skb_free_head(skb); 2251 2277 } 2252 2278 off = (data + nhead) - skb->head; 2253 2279 ··· 6548 6574 skb_frag_ref(skb, i); 6549 6575 if (skb_has_frag_list(skb)) 6550 6576 skb_clone_fraglist(skb); 6551 - skb_release_data(skb, SKB_CONSUMED, false); 6577 + skb_release_data(skb, SKB_CONSUMED); 6552 6578 } else { 6553 6579 /* we can reuse existing recount- all we did was 6554 6580 * relocate values 6555 6581 */ 6556 - skb_free_head(skb, false); 6582 + skb_free_head(skb); 6557 6583 } 6558 6584 6559 6585 skb->head = data; ··· 6688 6714 skb_kfree_head(data, size); 6689 6715 return -ENOMEM; 6690 6716 } 6691 - skb_release_data(skb, SKB_CONSUMED, false); 6717 + skb_release_data(skb, SKB_CONSUMED); 6692 6718 6693 6719 skb->head = data; 6694 6720 skb->head_frag = 0;
+1 -1
net/ipv4/esp4.c
··· 114 114 */ 115 115 if (req->src != req->dst) 116 116 for (sg = sg_next(req->src); sg; sg = sg_next(sg)) 117 - skb_page_unref(skb, sg_page(sg), false); 117 + skb_page_unref(skb, sg_page(sg)); 118 118 } 119 119 120 120 #ifdef CONFIG_INET_ESPINTCP
+1 -1
net/ipv6/esp6.c
··· 131 131 */ 132 132 if (req->src != req->dst) 133 133 for (sg = sg_next(req->src); sg; sg = sg_next(sg)) 134 - skb_page_unref(skb, sg_page(sg), false); 134 + skb_page_unref(skb, sg_page(sg)); 135 135 } 136 136 137 137 #ifdef CONFIG_INET6_ESPINTCP