Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: remove gfp_mask from napi_alloc_skb()

__napi_alloc_skb() is napi_alloc_skb() with the added flexibility
of choosing gfp_mask. This is a NAPI function, so GFP_ATOMIC is
implied. The only practical choice the caller has is whether to
set __GFP_NOWARN. But that's a false choice, too: allocation failures
in atomic context will happen, and printing warnings in logs,
effectively for a packet drop, is both too much and very likely
non-actionable.

This leads me to a conclusion that most uses of napi_alloc_skb()
are simply misguided, and should use __GFP_NOWARN in the first
place. We also have a "standard" way of reporting allocation
failures via the queue stat API (qstats::rx-alloc-fail).

The direct motivation for this patch is that one of the drivers
used at Meta calls napi_alloc_skb() (so prior to this patch without
__GFP_NOWARN), and the resulting OOM warning is the top networking
warning in our fleet.

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240327040213.3153864-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+18 -36
+1 -1
Documentation/mm/page_frags.rst
··· 25 25 The network stack uses two separate caches per CPU to handle fragment 26 26 allocation. The netdev_alloc_cache is used by callers making use of the 27 27 netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is 28 - used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The 28 + used by callers of the __napi_alloc_frag and napi_alloc_skb calls. The 29 29 main difference between these two calls is the context in which they may be 30 30 called. The "netdev" prefixed functions are usable in any context as these 31 31 functions will disable interrupts, while the "napi" prefixed functions are
+1 -1
Documentation/translations/zh_CN/mm/page_frags.rst
··· 25 25 26 26 网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 27 27 netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache 28 - 被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 28 + 被调用__napi_alloc_frag和napi_alloc_skb的调用者使用。这两个调用的主要区别是 29 29 它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 30 30 将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 31 31
+1 -3
drivers/net/ethernet/intel/i40e/i40e_txrx.c
··· 2144 2144 */ 2145 2145 2146 2146 /* allocate a skb to store the frags */ 2147 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, 2148 - I40E_RX_HDR_SIZE, 2149 - GFP_ATOMIC | __GFP_NOWARN); 2147 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, I40E_RX_HDR_SIZE); 2150 2148 if (unlikely(!skb)) 2151 2149 return NULL; 2152 2150
+1 -2
drivers/net/ethernet/intel/i40e/i40e_xsk.c
··· 301 301 net_prefetch(xdp->data_meta); 302 302 303 303 /* allocate a skb to store the frags */ 304 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, 305 - GFP_ATOMIC | __GFP_NOWARN); 304 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); 306 305 if (unlikely(!skb)) 307 306 goto out; 308 307
+1 -3
drivers/net/ethernet/intel/iavf/iavf_txrx.c
··· 1334 1334 net_prefetch(va); 1335 1335 1336 1336 /* allocate a skb to store the frags */ 1337 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, 1338 - IAVF_RX_HDR_SIZE, 1339 - GFP_ATOMIC | __GFP_NOWARN); 1337 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, IAVF_RX_HDR_SIZE); 1340 1338 if (unlikely(!skb)) 1341 1339 return NULL; 1342 1340
+1 -2
drivers/net/ethernet/intel/ice/ice_txrx.c
··· 1051 1051 } 1052 1052 1053 1053 /* allocate a skb to store the frags */ 1054 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, 1055 - GFP_ATOMIC | __GFP_NOWARN); 1054 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE); 1056 1055 if (unlikely(!skb)) 1057 1056 return NULL; 1058 1057
+1 -2
drivers/net/ethernet/intel/ice/ice_xsk.c
··· 555 555 } 556 556 net_prefetch(xdp->data_meta); 557 557 558 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, 559 - GFP_ATOMIC | __GFP_NOWARN); 558 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); 560 559 if (unlikely(!skb)) 561 560 return NULL; 562 561
+2 -3
drivers/net/ethernet/intel/idpf/idpf_txrx.c
··· 3005 3005 /* prefetch first cache line of first page */ 3006 3006 net_prefetch(va); 3007 3007 /* allocate a skb to store the frags */ 3008 - skb = __napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE, 3009 - GFP_ATOMIC); 3008 + skb = napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE); 3010 3009 if (unlikely(!skb)) { 3011 3010 idpf_rx_put_page(rx_buf); 3012 3011 ··· 3059 3060 struct sk_buff *skb; 3060 3061 3061 3062 /* allocate a skb to store the frags */ 3062 - skb = __napi_alloc_skb(&rxq->q_vector->napi, size, GFP_ATOMIC); 3063 + skb = napi_alloc_skb(&rxq->q_vector->napi, size); 3063 3064 if (unlikely(!skb)) 3064 3065 return NULL; 3065 3066
+1 -2
drivers/net/ethernet/intel/igc/igc_main.c
··· 2712 2712 2713 2713 net_prefetch(xdp->data_meta); 2714 2714 2715 - skb = __napi_alloc_skb(&ring->q_vector->napi, totalsize, 2716 - GFP_ATOMIC | __GFP_NOWARN); 2715 + skb = napi_alloc_skb(&ring->q_vector->napi, totalsize); 2717 2716 if (unlikely(!skb)) 2718 2717 return NULL; 2719 2718
+1 -2
drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
··· 220 220 net_prefetch(xdp->data_meta); 221 221 222 222 /* allocate a skb to store the frags */ 223 - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, 224 - GFP_ATOMIC | __GFP_NOWARN); 223 + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); 225 224 if (unlikely(!skb)) 226 225 return NULL; 227 226
+2 -3
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
··· 5109 5109 unsigned int datasize = xdp->data_end - xdp->data; 5110 5110 struct sk_buff *skb; 5111 5111 5112 - skb = __napi_alloc_skb(&ch->rxtx_napi, 5113 - xdp->data_end - xdp->data_hard_start, 5114 - GFP_ATOMIC | __GFP_NOWARN); 5112 + skb = napi_alloc_skb(&ch->rxtx_napi, 5113 + xdp->data_end - xdp->data_hard_start); 5115 5114 if (unlikely(!skb)) 5116 5115 return NULL; 5117 5116
+1 -7
include/linux/skbuff.h
··· 3350 3350 return __napi_alloc_frag_align(fragsz, -align); 3351 3351 } 3352 3352 3353 - struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 3354 - unsigned int length, gfp_t gfp_mask); 3355 - static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 3356 - unsigned int length) 3357 - { 3358 - return __napi_alloc_skb(napi, length, GFP_ATOMIC); 3359 - } 3353 + struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length); 3360 3354 void napi_consume_skb(struct sk_buff *skb, int budget); 3361 3355 3362 3356 void napi_skb_free_stolen_head(struct sk_buff *skb);
+4 -5
net/core/skbuff.c
··· 775 775 EXPORT_SYMBOL(__netdev_alloc_skb); 776 776 777 777 /** 778 - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance 778 + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance 779 779 * @napi: napi instance this buffer was allocated for 780 780 * @len: length to allocate 781 - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages 782 781 * 783 782 * Allocate a new sk_buff for use in NAPI receive. This buffer will 784 783 * attempt to allocate the head from a special reserved region used ··· 786 787 * 787 788 * %NULL is returned if there is no free memory. 788 789 */ 789 - struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, 790 - gfp_t gfp_mask) 790 + struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) 791 791 { 792 + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; 792 793 struct napi_alloc_cache *nc; 793 794 struct sk_buff *skb; 794 795 bool pfmemalloc; ··· 859 860 skb_fail: 860 861 return skb; 861 862 } 862 - EXPORT_SYMBOL(__napi_alloc_skb); 863 + EXPORT_SYMBOL(napi_alloc_skb); 863 864 864 865 void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, 865 866 int off, int size, unsigned int truesize)