net/core/page_pool.c at v5.12-rc5 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / page_pool.c
at v5.12-rc5 585 lines 16 kB view raw
  1/* SPDX-License-Identifier: GPL-2.0
  2 *
  3 * page_pool.c
  4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
  5 *	Copyright (C) 2016 Red Hat, Inc.
  6 */
  7
  8#include <linux/types.h>
  9#include <linux/kernel.h>
 10#include <linux/slab.h>
 11#include <linux/device.h>
 12
 13#include <net/page_pool.h>
 14#include <net/xdp.h>
 15
 16#include <linux/dma-direction.h>
 17#include <linux/dma-mapping.h>
 18#include <linux/page-flags.h>
 19#include <linux/mm.h> /* for __put_page() */
 20
 21#include <trace/events/page_pool.h>
 22
 23#define DEFER_TIME (msecs_to_jiffies(1000))
 24#define DEFER_WARN_INTERVAL (60 * HZ)
 25
 26static int page_pool_init(struct page_pool *pool,
 27			  const struct page_pool_params *params)
 28{
 29	unsigned int ring_qsize = 1024; /* Default */
 30
 31	memcpy(&pool->p, params, sizeof(pool->p));
 32
 33	/* Validate only known flags were used */
 34	if (pool->p.flags & ~(PP_FLAG_ALL))
 35		return -EINVAL;
 36
 37	if (pool->p.pool_size)
 38		ring_qsize = pool->p.pool_size;
 39
 40	/* Sanity limit mem that can be pinned down */
 41	if (ring_qsize > 32768)
 42		return -E2BIG;
 43
 44	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 45	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
 46	 * which is the XDP_TX use-case.
 47	 */
 48	if (pool->p.flags & PP_FLAG_DMA_MAP) {
 49		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 50		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 51			return -EINVAL;
 52	}
 53
 54	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
 55		/* In order to request DMA-sync-for-device the page
 56		 * needs to be mapped
 57		 */
 58		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 59			return -EINVAL;
 60
 61		if (!pool->p.max_len)
 62			return -EINVAL;
 63
 64		/* pool->p.offset has to be set according to the address
 65		 * offset used by the DMA engine to start copying rx data
 66		 */
 67	}
 68
 69	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 70		return -ENOMEM;
 71
 72	atomic_set(&pool->pages_state_release_cnt, 0);
 73
 74	/* Driver calling page_pool_create() also call page_pool_destroy() */
 75	refcount_set(&pool->user_cnt, 1);
 76
 77	if (pool->p.flags & PP_FLAG_DMA_MAP)
 78		get_device(pool->p.dev);
 79
 80	return 0;
 81}
 82
 83struct page_pool *page_pool_create(const struct page_pool_params *params)
 84{
 85	struct page_pool *pool;
 86	int err;
 87
 88	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 89	if (!pool)
 90		return ERR_PTR(-ENOMEM);
 91
 92	err = page_pool_init(pool, params);
 93	if (err < 0) {
 94		pr_warn("%s() gave up with errno %d\n", __func__, err);
 95		kfree(pool);
 96		return ERR_PTR(err);
 97	}
 98
 99	return pool;
100}
101EXPORT_SYMBOL(page_pool_create);
102
103static void page_pool_return_page(struct page_pool *pool, struct page *page);
104
105noinline
106static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
107{
108	struct ptr_ring *r = &pool->ring;
109	struct page *page;
110	int pref_nid; /* preferred NUMA node */
111
112	/* Quicker fallback, avoid locks when ring is empty */
113	if (__ptr_ring_empty(r))
114		return NULL;
115
116	/* Softirq guarantee CPU and thus NUMA node is stable. This,
117	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
118	 */
119#ifdef CONFIG_NUMA
120	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
121#else
122	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
123	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
124#endif
125
126	/* Slower-path: Get pages from locked ring queue */
127	spin_lock(&r->consumer_lock);
128
129	/* Refill alloc array, but only if NUMA match */
130	do {
131		page = __ptr_ring_consume(r);
132		if (unlikely(!page))
133			break;
134
135		if (likely(page_to_nid(page) == pref_nid)) {
136			pool->alloc.cache[pool->alloc.count++] = page;
137		} else {
138			/* NUMA mismatch;
139			 * (1) release 1 page to page-allocator and
140			 * (2) break out to fallthrough to alloc_pages_node.
141			 * This limit stress on page buddy alloactor.
142			 */
143			page_pool_return_page(pool, page);
144			page = NULL;
145			break;
146		}
147	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
148
149	/* Return last page */
150	if (likely(pool->alloc.count > 0))
151		page = pool->alloc.cache[--pool->alloc.count];
152
153	spin_unlock(&r->consumer_lock);
154	return page;
155}
156
157/* fast path */
158static struct page *__page_pool_get_cached(struct page_pool *pool)
159{
160	struct page *page;
161
162	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
163	if (likely(pool->alloc.count)) {
164		/* Fast-path */
165		page = pool->alloc.cache[--pool->alloc.count];
166	} else {
167		page = page_pool_refill_alloc_cache(pool);
168	}
169
170	return page;
171}
172
173static void page_pool_dma_sync_for_device(struct page_pool *pool,
174					  struct page *page,
175					  unsigned int dma_sync_size)
176{
177	dma_sync_size = min(dma_sync_size, pool->p.max_len);
178	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
179					 pool->p.offset, dma_sync_size,
180					 pool->p.dma_dir);
181}
182
183/* slow path */
184noinline
185static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
186						 gfp_t _gfp)
187{
188	struct page *page;
189	gfp_t gfp = _gfp;
190	dma_addr_t dma;
191
192	/* We could always set __GFP_COMP, and avoid this branch, as
193	 * prep_new_page() can handle order-0 with __GFP_COMP.
194	 */
195	if (pool->p.order)
196		gfp |= __GFP_COMP;
197
198	/* FUTURE development:
199	 *
200	 * Current slow-path essentially falls back to single page
201	 * allocations, which doesn't improve performance.  This code
202	 * need bulk allocation support from the page allocator code.
203	 */
204
205	/* Cache was empty, do real allocation */
206#ifdef CONFIG_NUMA
207	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
208#else
209	page = alloc_pages(gfp, pool->p.order);
210#endif
211	if (!page)
212		return NULL;
213
214	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
215		goto skip_dma_map;
216
217	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
218	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
219	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
220	 * This mapping is kept for lifetime of page, until leaving pool.
221	 */
222	dma = dma_map_page_attrs(pool->p.dev, page, 0,
223				 (PAGE_SIZE << pool->p.order),
224				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
225	if (dma_mapping_error(pool->p.dev, dma)) {
226		put_page(page);
227		return NULL;
228	}
229	page->dma_addr = dma;
230
231	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
232		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
233
234skip_dma_map:
235	/* Track how many pages are held 'in-flight' */
236	pool->pages_state_hold_cnt++;
237
238	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
239
240	/* When page just alloc'ed is should/must have refcnt 1. */
241	return page;
242}
243
244/* For using page_pool replace: alloc_pages() API calls, but provide
245 * synchronization guarantee for allocation side.
246 */
247struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
248{
249	struct page *page;
250
251	/* Fast-path: Get a page from cache */
252	page = __page_pool_get_cached(pool);
253	if (page)
254		return page;
255
256	/* Slow-path: cache empty, do real allocation */
257	page = __page_pool_alloc_pages_slow(pool, gfp);
258	return page;
259}
260EXPORT_SYMBOL(page_pool_alloc_pages);
261
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * expression wherever it is used (e.g. as an operand of sizeof).
 */
#define _distance(a, b)	((s32)((a) - (b)))
266
267static s32 page_pool_inflight(struct page_pool *pool)
268{
269	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
270	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
271	s32 inflight;
272
273	inflight = _distance(hold_cnt, release_cnt);
274
275	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
276	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
277
278	return inflight;
279}
280
281/* Disconnects a page (from a page_pool).  API users can have a need
282 * to disconnect a page (from a page_pool), to allow it to be used as
283 * a regular page (that will eventually be returned to the normal
284 * page-allocator via put_page).
285 */
286void page_pool_release_page(struct page_pool *pool, struct page *page)
287{
288	dma_addr_t dma;
289	int count;
290
291	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
292		/* Always account for inflight pages, even if we didn't
293		 * map them
294		 */
295		goto skip_dma_unmap;
296
297	dma = page->dma_addr;
298
299	/* When page is unmapped, it cannot be returned our pool */
300	dma_unmap_page_attrs(pool->p.dev, dma,
301			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
302			     DMA_ATTR_SKIP_CPU_SYNC);
303	page->dma_addr = 0;
304skip_dma_unmap:
305	/* This may be the last page returned, releasing the pool, so
306	 * it is not safe to reference pool afterwards.
307	 */
308	count = atomic_inc_return(&pool->pages_state_release_cnt);
309	trace_page_pool_state_release(pool, page, count);
310}
311EXPORT_SYMBOL(page_pool_release_page);
312
/* Return a page to the page allocator, cleaning up our state:
 * first detach it from the pool, then drop our reference.
 */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
	put_page(page);
}
324
325static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
326{
327	int ret;
328	/* BH protection not needed if current is serving softirq */
329	if (in_serving_softirq())
330		ret = ptr_ring_produce(&pool->ring, page);
331	else
332		ret = ptr_ring_produce_bh(&pool->ring, page);
333
334	return (ret == 0) ? true : false;
335}
336
337/* Only allow direct recycling in special circumstances, into the
338 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
339 *
340 * Caller must provide appropriate safe context.
341 */
342static bool page_pool_recycle_in_cache(struct page *page,
343				       struct page_pool *pool)
344{
345	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
346		return false;
347
348	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
349	pool->alloc.cache[pool->alloc.count++] = page;
350	return true;
351}
352
353/* If the page refcnt == 1, this will try to recycle the page.
354 * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
355 * the configured size min(dma_sync_size, pool->max_len).
356 * If the page refcnt != 1, then the page will be returned to memory
357 * subsystem.
358 */
359static __always_inline struct page *
360__page_pool_put_page(struct page_pool *pool, struct page *page,
361		     unsigned int dma_sync_size, bool allow_direct)
362{
363	/* This allocator is optimized for the XDP mode that uses
364	 * one-frame-per-page, but have fallbacks that act like the
365	 * regular page allocator APIs.
366	 *
367	 * refcnt == 1 means page_pool owns page, and can recycle it.
368	 *
369	 * page is NOT reusable when allocated when system is under
370	 * some pressure. (page_is_pfmemalloc)
371	 */
372	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
373		/* Read barrier done in page_ref_count / READ_ONCE */
374
375		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
376			page_pool_dma_sync_for_device(pool, page,
377						      dma_sync_size);
378
379		if (allow_direct && in_serving_softirq() &&
380		    page_pool_recycle_in_cache(page, pool))
381			return NULL;
382
383		/* Page found as candidate for recycling */
384		return page;
385	}
386	/* Fallback/non-XDP mode: API user have elevated refcnt.
387	 *
388	 * Many drivers split up the page into fragments, and some
389	 * want to keep doing this to save memory and do refcnt based
390	 * recycling. Support this use case too, to ease drivers
391	 * switching between XDP/non-XDP.
392	 *
393	 * In-case page_pool maintains the DMA mapping, API user must
394	 * call page_pool_put_page once.  In this elevated refcnt
395	 * case, the DMA is unmapped/released, as driver is likely
396	 * doing refcnt based recycle tricks, meaning another process
397	 * will be invoking put_page.
398	 */
399	/* Do not replace this with page_pool_return_page() */
400	page_pool_release_page(pool, page);
401	put_page(page);
402
403	return NULL;
404}
405
406void page_pool_put_page(struct page_pool *pool, struct page *page,
407			unsigned int dma_sync_size, bool allow_direct)
408{
409	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
410	if (page && !page_pool_recycle_in_ring(pool, page)) {
411		/* Cache full, fallback to free pages */
412		page_pool_return_page(pool, page);
413	}
414}
415EXPORT_SYMBOL(page_pool_put_page);
416
417/* Caller must not use data area after call, as this function overwrites it */
418void page_pool_put_page_bulk(struct page_pool *pool, void **data,
419			     int count)
420{
421	int i, bulk_len = 0;
422
423	for (i = 0; i < count; i++) {
424		struct page *page = virt_to_head_page(data[i]);
425
426		page = __page_pool_put_page(pool, page, -1, false);
427		/* Approved for bulk recycling in ptr_ring cache */
428		if (page)
429			data[bulk_len++] = page;
430	}
431
432	if (unlikely(!bulk_len))
433		return;
434
435	/* Bulk producer into ptr_ring page_pool cache */
436	page_pool_ring_lock(pool);
437	for (i = 0; i < bulk_len; i++) {
438		if (__ptr_ring_produce(&pool->ring, data[i]))
439			break; /* ring full */
440	}
441	page_pool_ring_unlock(pool);
442
443	/* Hopefully all pages was return into ptr_ring */
444	if (likely(i == bulk_len))
445		return;
446
447	/* ptr_ring cache full, free remaining pages outside producer lock
448	 * since put_page() with refcnt == 1 can be an expensive operation
449	 */
450	for (; i < bulk_len; i++)
451		page_pool_return_page(pool, data[i]);
452}
453EXPORT_SYMBOL(page_pool_put_page_bulk);
454
455static void page_pool_empty_ring(struct page_pool *pool)
456{
457	struct page *page;
458
459	/* Empty recycle ring */
460	while ((page = ptr_ring_consume_bh(&pool->ring))) {
461		/* Verify the refcnt invariant of cached pages */
462		if (!(page_ref_count(page) == 1))
463			pr_crit("%s() page_pool refcnt %d violation\n",
464				__func__, page_ref_count(page));
465
466		page_pool_return_page(pool, page);
467	}
468}
469
470static void page_pool_free(struct page_pool *pool)
471{
472	if (pool->disconnect)
473		pool->disconnect(pool);
474
475	ptr_ring_cleanup(&pool->ring, NULL);
476
477	if (pool->p.flags & PP_FLAG_DMA_MAP)
478		put_device(pool->p.dev);
479
480	kfree(pool);
481}
482
483static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
484{
485	struct page *page;
486
487	if (pool->destroy_cnt)
488		return;
489
490	/* Empty alloc cache, assume caller made sure this is
491	 * no-longer in use, and page_pool_alloc_pages() cannot be
492	 * call concurrently.
493	 */
494	while (pool->alloc.count) {
495		page = pool->alloc.cache[--pool->alloc.count];
496		page_pool_return_page(pool, page);
497	}
498}
499
500static void page_pool_scrub(struct page_pool *pool)
501{
502	page_pool_empty_alloc_cache_once(pool);
503	pool->destroy_cnt++;
504
505	/* No more consumers should exist, but producers could still
506	 * be in-flight.
507	 */
508	page_pool_empty_ring(pool);
509}
510
/* Scrub the pool and free it once no pages are in flight.
 *
 * Return: the number of in-flight pages; 0 means the pool has been
 * freed and must not be touched again.
 */
static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);

	inflight = page_pool_inflight(pool);
	if (inflight)
		return inflight;

	page_pool_free(pool);
	return 0;
}
522
523static void page_pool_release_retry(struct work_struct *wq)
524{
525	struct delayed_work *dwq = to_delayed_work(wq);
526	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
527	int inflight;
528
529	inflight = page_pool_release(pool);
530	if (!inflight)
531		return;
532
533	/* Periodic warning */
534	if (time_after_eq(jiffies, pool->defer_warn)) {
535		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
536
537		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
538			__func__, inflight, sec);
539		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
540	}
541
542	/* Still not ready to be disconnected, retry later */
543	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
544}
545
546void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
547{
548	refcount_inc(&pool->user_cnt);
549	pool->disconnect = disconnect;
550}
551
552void page_pool_destroy(struct page_pool *pool)
553{
554	if (!pool)
555		return;
556
557	if (!page_pool_put(pool))
558		return;
559
560	if (!page_pool_release(pool))
561		return;
562
563	pool->defer_start = jiffies;
564	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
565
566	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
567	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
568}
569EXPORT_SYMBOL(page_pool_destroy);
570
571/* Caller must provide appropriate safe context, e.g. NAPI. */
572void page_pool_update_nid(struct page_pool *pool, int new_nid)
573{
574	struct page *page;
575
576	trace_page_pool_update_nid(pool, new_nid);
577	pool->p.nid = new_nid;
578
579	/* Flush pool alloc cache, as refill will check NUMA node */
580	while (pool->alloc.count) {
581		page = pool->alloc.cache[--pool->alloc.count];
582		page_pool_return_page(pool, page);
583	}
584}
585EXPORT_SYMBOL(page_pool_update_nid);