net/core/skbuff.c at v4.11-rc8 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / skbuff.c
at v4.11-rc8 4997 lines 126 kB view raw
   1/*
   2 *	Routines having to do with the 'struct sk_buff' memory handlers.
   3 *
   4 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
   5 *			Florian La Roche <rzsfl@rz.uni-sb.de>
   6 *
   7 *	Fixes:
   8 *		Alan Cox	:	Fixed the worst of the load
   9 *					balancer bugs.
  10 *		Dave Platt	:	Interrupt stacking fix.
  11 *	Richard Kooijman	:	Timestamp fixes.
  12 *		Alan Cox	:	Changed buffer format.
  13 *		Alan Cox	:	destructor hook for AF_UNIX etc.
  14 *		Linus Torvalds	:	Better skb_clone.
  15 *		Alan Cox	:	Added skb_copy.
  16 *		Alan Cox	:	Added all the changed routines Linus
  17 *					only put in the headers
  18 *		Ray VanTassle	:	Fixed --skb->lock in free
  19 *		Alan Cox	:	skb_copy copy arp field
  20 *		Andi Kleen	:	slabified it.
  21 *		Robert Olsson	:	Removed skb_head_pool
  22 *
  23 *	NOTE:
  24 *		The __skb_ routines should be called with interrupts
  25 *	disabled, or you better be *real* sure that the operation is atomic
  26 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
  27 *	or via disabling bottom half handlers, etc).
  28 *
  29 *	This program is free software; you can redistribute it and/or
  30 *	modify it under the terms of the GNU General Public License
  31 *	as published by the Free Software Foundation; either version
  32 *	2 of the License, or (at your option) any later version.
  33 */
  34
  35/*
  36 *	The functions in this file will not compile correctly with gcc 2.4.x
  37 */
  38
  39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  40
  41#include <linux/module.h>
  42#include <linux/types.h>
  43#include <linux/kernel.h>
  44#include <linux/kmemcheck.h>
  45#include <linux/mm.h>
  46#include <linux/interrupt.h>
  47#include <linux/in.h>
  48#include <linux/inet.h>
  49#include <linux/slab.h>
  50#include <linux/tcp.h>
  51#include <linux/udp.h>
  52#include <linux/sctp.h>
  53#include <linux/netdevice.h>
  54#ifdef CONFIG_NET_CLS_ACT
  55#include <net/pkt_sched.h>
  56#endif
  57#include <linux/string.h>
  58#include <linux/skbuff.h>
  59#include <linux/splice.h>
  60#include <linux/cache.h>
  61#include <linux/rtnetlink.h>
  62#include <linux/init.h>
  63#include <linux/scatterlist.h>
  64#include <linux/errqueue.h>
  65#include <linux/prefetch.h>
  66#include <linux/if_vlan.h>
  67
  68#include <net/protocol.h>
  69#include <net/dst.h>
  70#include <net/sock.h>
  71#include <net/checksum.h>
  72#include <net/ip6_checksum.h>
  73#include <net/xfrm.h>
  74
  75#include <linux/uaccess.h>
  76#include <trace/events/skb.h>
  77#include <linux/highmem.h>
  78#include <linux/capability.h>
  79#include <linux/user_namespace.h>
  80
  /* Slab cache from which the allocators in this file take plain struct sk_buff heads. */
  81struct kmem_cache *skbuff_head_cache __read_mostly;
/* Slab cache for fast-clone pairs (struct sk_buff_fclones); __alloc_skb() selects it when SKB_ALLOC_FCLONE is set. */
  82static struct kmem_cache *skbuff_fclone_cache __read_mostly;
/*
 * Exported tunable initialised to MAX_SKB_FRAGS.
 * NOTE(review): presumably a sysctl-controlled per-skb fragment limit; its
 * consumers are not visible in this part of the file - confirm.
 */
  83int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
  84EXPORT_SYMBOL(sysctl_max_skb_frags);
  85
  86/**
  87 *	skb_panic - private function for out-of-line support
  88 *	@skb:	buffer
  89 *	@sz:	size
  90 *	@addr:	address
  91 *	@msg:	skb_over_panic or skb_under_panic
  92 *
  93 *	Out-of-line support for skb_put() and skb_push().
  94 *	Called via the wrapper skb_over_panic() or skb_under_panic().
  95 *	Keep out of line to prevent kernel bloat.
  96 *	__builtin_return_address is not used because it is not always reliable.
  97 */
  98static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
  99		      const char msg[])
 100{
 101	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
 102		 msg, addr, skb->len, sz, skb->head, skb->data,
 103		 (unsigned long)skb->tail, (unsigned long)skb->end,
 104		 skb->dev ? skb->dev->name : "<NULL>");
 105	BUG();
 106}
 107
 /* Forward to skb_panic(), tagging the report with this function's own name. */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	const char *reporter = __func__;

	skb_panic(skb, sz, addr, reporter);
}
 112
 /* Forward to skb_panic(), tagging the report with this function's own name. */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	const char *reporter = __func__;

	skb_panic(skb, sz, addr, reporter);
}
 117
 /*
 * kmalloc_reserve() wraps kmalloc_node_track_caller() and reports through
 * @pfmemalloc whether the emergency pfmemalloc reserves had to be used.
 * If they were and the socket later turns out to be SOCK_MEMALLOC, the
 * PFMEMALLOC reserves may be used; otherwise the packet data may be
 * discarded until enough memory is free.
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	__kmalloc_reserve((size), (gfp), (node), _RET_IP_, (pfmemalloc))
 127
 128static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
 129			       unsigned long ip, bool *pfmemalloc)
 130{
 131	void *obj;
 132	bool ret_pfmemalloc = false;
 133
 134	/*
 135	 * Try a regular allocation, when that fails and we're not entitled
 136	 * to the reserves, fail.
 137	 */
 138	obj = kmalloc_node_track_caller(size,
 139					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 140					node);
 141	if (obj || !(gfp_pfmemalloc_allowed(flags)))
 142		goto out;
 143
 144	/* Try again but now we are using pfmemalloc reserves */
 145	ret_pfmemalloc = true;
 146	obj = kmalloc_node_track_caller(size, flags, node);
 147
 148out:
 149	if (pfmemalloc)
 150		*pfmemalloc = ret_pfmemalloc;
 151
 152	return obj;
 153}
 154
 155/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
 156 *	'private' fields and also do memory statistics to find all the
 157 *	[BEEP] leaks.
 158 *
 159 */
 160
 161struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
 162{
 163	struct sk_buff *skb;
 164
 165	/* Get the HEAD */
 166	skb = kmem_cache_alloc_node(skbuff_head_cache,
 167				    gfp_mask & ~__GFP_DMA, node);
 168	if (!skb)
 169		goto out;
 170
 171	/*
 172	 * Only clear those fields we need to clear, not those that we will
 173	 * actually initialise below. Hence, don't put any more fields after
 174	 * the tail pointer in struct sk_buff!
 175	 */
 176	memset(skb, 0, offsetof(struct sk_buff, tail));
 177	skb->head = NULL;
 178	skb->truesize = sizeof(struct sk_buff);
 179	atomic_set(&skb->users, 1);
 180
 181	skb->mac_header = (typeof(skb->mac_header))~0U;
 182out:
 183	return skb;
 184}
 185
 186/**
 187 *	__alloc_skb	-	allocate a network buffer
 188 *	@size: size to allocate
 189 *	@gfp_mask: allocation mask
 190 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 191 *		instead of head cache and allocate a cloned (child) skb.
 192 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 193 *		allocations in case the data is required for writeback
 194 *	@node: numa node to allocate memory on
 195 *
 196 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 197 *	tail room of at least size bytes. The object has a reference count
 198 *	of one. The return is the buffer. On a failure the return is %NULL.
 199 *
 200 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 201 *	%GFP_ATOMIC.
 202 */
 203struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 204			    int flags, int node)
 205{
 206	struct kmem_cache *cache;
 207	struct skb_shared_info *shinfo;
 208	struct sk_buff *skb;
 209	u8 *data;
 210	bool pfmemalloc;
 211
 212	cache = (flags & SKB_ALLOC_FCLONE)
 213		? skbuff_fclone_cache : skbuff_head_cache;
 214
 215	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
 216		gfp_mask |= __GFP_MEMALLOC;
 217
 218	/* Get the HEAD */
 219	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 220	if (!skb)
 221		goto out;
 222	prefetchw(skb);
 223
 224	/* We do our best to align skb_shared_info on a separate cache
 225	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 226	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 227	 * Both skb->head and skb_shared_info are cache line aligned.
 228	 */
 229	size = SKB_DATA_ALIGN(size);
 230	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 231	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
 232	if (!data)
 233		goto nodata;
 234	/* kmalloc(size) might give us more room than requested.
 235	 * Put skb_shared_info exactly at the end of allocated zone,
 236	 * to allow max possible filling before reallocation.
 237	 */
 238	size = SKB_WITH_OVERHEAD(ksize(data));
 239	prefetchw(data + size);
 240
 241	/*
 242	 * Only clear those fields we need to clear, not those that we will
 243	 * actually initialise below. Hence, don't put any more fields after
 244	 * the tail pointer in struct sk_buff!
 245	 */
 246	memset(skb, 0, offsetof(struct sk_buff, tail));
 247	/* Account for allocated memory : skb + skb->head */
 248	skb->truesize = SKB_TRUESIZE(size);
 249	skb->pfmemalloc = pfmemalloc;
 250	atomic_set(&skb->users, 1);
 251	skb->head = data;
 252	skb->data = data;
 253	skb_reset_tail_pointer(skb);
 254	skb->end = skb->tail + size;
 255	skb->mac_header = (typeof(skb->mac_header))~0U;
 256	skb->transport_header = (typeof(skb->transport_header))~0U;
 257
 258	/* make sure we initialize shinfo sequentially */
 259	shinfo = skb_shinfo(skb);
 260	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 261	atomic_set(&shinfo->dataref, 1);
 262	kmemcheck_annotate_variable(shinfo->destructor_arg);
 263
 264	if (flags & SKB_ALLOC_FCLONE) {
 265		struct sk_buff_fclones *fclones;
 266
 267		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 268
 269		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
 270		skb->fclone = SKB_FCLONE_ORIG;
 271		atomic_set(&fclones->fclone_ref, 1);
 272
 273		fclones->skb2.fclone = SKB_FCLONE_CLONE;
 274	}
 275out:
 276	return skb;
 277nodata:
 278	kmem_cache_free(cache, skb);
 279	skb = NULL;
 280	goto out;
 281}
 282EXPORT_SYMBOL(__alloc_skb);
 283
 284/**
 285 * __build_skb - build a network buffer
 286 * @data: data buffer provided by caller
 287 * @frag_size: size of data, or 0 if head was kmalloced
 288 *
 289 * Allocate a new &sk_buff. Caller provides space holding head and
 290 * skb_shared_info. @data must have been allocated by kmalloc() only if
 291 * @frag_size is 0, otherwise data should come from the page allocator
 292 *  or vmalloc()
 293 * The return is the new skb buffer.
 294 * On a failure the return is %NULL, and @data is not freed.
 295 * Notes :
 296 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 297 *  Driver should add room at head (NET_SKB_PAD) and
 298 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 299 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 300 *  before giving packet to stack.
 301 *  RX rings only contains data buffers, not full skbs.
 302 */
 303struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 304{
 305	struct skb_shared_info *shinfo;
 306	struct sk_buff *skb;
 307	unsigned int size = frag_size ? : ksize(data);
 308
 309	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 310	if (!skb)
 311		return NULL;
 312
 313	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 314
 315	memset(skb, 0, offsetof(struct sk_buff, tail));
 316	skb->truesize = SKB_TRUESIZE(size);
 317	atomic_set(&skb->users, 1);
 318	skb->head = data;
 319	skb->data = data;
 320	skb_reset_tail_pointer(skb);
 321	skb->end = skb->tail + size;
 322	skb->mac_header = (typeof(skb->mac_header))~0U;
 323	skb->transport_header = (typeof(skb->transport_header))~0U;
 324
 325	/* make sure we initialize shinfo sequentially */
 326	shinfo = skb_shinfo(skb);
 327	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 328	atomic_set(&shinfo->dataref, 1);
 329	kmemcheck_annotate_variable(shinfo->destructor_arg);
 330
 331	return skb;
 332}
 333
 334/* build_skb() is wrapper over __build_skb(), that specifically
 335 * takes care of skb->head and skb->pfmemalloc
 336 * This means that if @frag_size is not zero, then @data must be backed
 337 * by a page fragment, not kmalloc() or vmalloc()
 338 */
 339struct sk_buff *build_skb(void *data, unsigned int frag_size)
 340{
 341	struct sk_buff *skb = __build_skb(data, frag_size);
 342
 343	if (skb && frag_size) {
 344		skb->head_frag = 1;
 345		if (page_is_pfmemalloc(virt_to_head_page(data)))
 346			skb->pfmemalloc = 1;
 347	}
 348	return skb;
 349}
 350EXPORT_SYMBOL(build_skb);
 351
 /* Number of sk_buff heads _kfree_skb_defer() collects before it bulk-frees them. */
 352#define NAPI_SKB_CACHE_SIZE	64
 353
/*
 * Per-CPU state used by the NAPI allocation and deferred-free helpers:
 * @page is the page fragment cache used by __napi_alloc_frag() and
 * __napi_alloc_skb(); @skb_cache holds @skb_count heads that are waiting
 * to be returned to skbuff_head_cache with kmem_cache_free_bulk().
 */
 354struct napi_alloc_cache {
 355	struct page_frag_cache page;
 356	unsigned int skb_count;
 357	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 358};
 359
/* Per-CPU fragment cache for the netdev_alloc_* helpers, which access it with local IRQs disabled. */
 360static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
/* Per-CPU cache for the napi_* helpers; they access it without disabling IRQs. */
 361static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 362
 363static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 364{
 365	struct page_frag_cache *nc;
 366	unsigned long flags;
 367	void *data;
 368
 369	local_irq_save(flags);
 370	nc = this_cpu_ptr(&netdev_alloc_cache);
 371	data = page_frag_alloc(nc, fragsz, gfp_mask);
 372	local_irq_restore(flags);
 373	return data;
 374}
 375
 376/**
 377 * netdev_alloc_frag - allocate a page fragment
 378 * @fragsz: fragment size
 379 *
 380 * Allocates a frag from a page for receive buffer.
 381 * Uses GFP_ATOMIC allocations.
 382 */
 383void *netdev_alloc_frag(unsigned int fragsz)
 384{
 385	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
 386}
 387EXPORT_SYMBOL(netdev_alloc_frag);
 388
 389static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 390{
 391	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 392
 393	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
 394}
 395
 396void *napi_alloc_frag(unsigned int fragsz)
 397{
 398	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
 399}
 400EXPORT_SYMBOL(napi_alloc_frag);
 401
 402/**
 403 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 404 *	@dev: network device to receive on
 405 *	@len: length to allocate
 406 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 407 *
 408 *	Allocate a new &sk_buff and assign it a usage count of one. The
 409 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 410 *	the headroom they think they need without accounting for the
 411 *	built in space. The built in space is used for optimisations.
 412 *
 413 *	%NULL is returned if there is no free memory.
 414 */
 415struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 416				   gfp_t gfp_mask)
 417{
 418	struct page_frag_cache *nc;
 419	unsigned long flags;
 420	struct sk_buff *skb;
 421	bool pfmemalloc;
 422	void *data;
 423
 424	len += NET_SKB_PAD;
 425
 426	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
 427	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 428		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
 429		if (!skb)
 430			goto skb_fail;
 431		goto skb_success;
 432	}
 433
 434	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 435	len = SKB_DATA_ALIGN(len);
 436
 437	if (sk_memalloc_socks())
 438		gfp_mask |= __GFP_MEMALLOC;
 439
 440	local_irq_save(flags);
 441
 442	nc = this_cpu_ptr(&netdev_alloc_cache);
 443	data = page_frag_alloc(nc, len, gfp_mask);
 444	pfmemalloc = nc->pfmemalloc;
 445
 446	local_irq_restore(flags);
 447
 448	if (unlikely(!data))
 449		return NULL;
 450
 451	skb = __build_skb(data, len);
 452	if (unlikely(!skb)) {
 453		skb_free_frag(data);
 454		return NULL;
 455	}
 456
 457	/* use OR instead of assignment to avoid clearing of bits in mask */
 458	if (pfmemalloc)
 459		skb->pfmemalloc = 1;
 460	skb->head_frag = 1;
 461
 462skb_success:
 463	skb_reserve(skb, NET_SKB_PAD);
 464	skb->dev = dev;
 465
 466skb_fail:
 467	return skb;
 468}
 469EXPORT_SYMBOL(__netdev_alloc_skb);
 470
 471/**
 472 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 473 *	@napi: napi instance this buffer was allocated for
 474 *	@len: length to allocate
 475 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 476 *
 477 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 478 *	attempt to allocate the head from a special reserved region used
 479 *	only for NAPI Rx allocation.  By doing this we can save several
 480 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 481 *
 482 *	%NULL is returned if there is no free memory.
 483 */
 484struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 485				 gfp_t gfp_mask)
 486{
 487	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 488	struct sk_buff *skb;
 489	void *data;
 490
 491	len += NET_SKB_PAD + NET_IP_ALIGN;
 492
 493	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
 494	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 495		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
 496		if (!skb)
 497			goto skb_fail;
 498		goto skb_success;
 499	}
 500
 501	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 502	len = SKB_DATA_ALIGN(len);
 503
 504	if (sk_memalloc_socks())
 505		gfp_mask |= __GFP_MEMALLOC;
 506
 507	data = page_frag_alloc(&nc->page, len, gfp_mask);
 508	if (unlikely(!data))
 509		return NULL;
 510
 511	skb = __build_skb(data, len);
 512	if (unlikely(!skb)) {
 513		skb_free_frag(data);
 514		return NULL;
 515	}
 516
 517	/* use OR instead of assignment to avoid clearing of bits in mask */
 518	if (nc->page.pfmemalloc)
 519		skb->pfmemalloc = 1;
 520	skb->head_frag = 1;
 521
 522skb_success:
 523	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 524	skb->dev = napi->dev;
 525
 526skb_fail:
 527	return skb;
 528}
 529EXPORT_SYMBOL(__napi_alloc_skb);
 530
 531void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 532		     int size, unsigned int truesize)
 533{
 534	skb_fill_page_desc(skb, i, page, off, size);
 535	skb->len += size;
 536	skb->data_len += size;
 537	skb->truesize += truesize;
 538}
 539EXPORT_SYMBOL(skb_add_rx_frag);
 540
 541void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 542			  unsigned int truesize)
 543{
 544	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 545
 546	skb_frag_size_add(frag, size);
 547	skb->len += size;
 548	skb->data_len += size;
 549	skb->truesize += truesize;
 550}
 551EXPORT_SYMBOL(skb_coalesce_rx_frag);
 552
 553static void skb_drop_list(struct sk_buff **listp)
 554{
 555	kfree_skb_list(*listp);
 556	*listp = NULL;
 557}
 558
 559static inline void skb_drop_fraglist(struct sk_buff *skb)
 560{
 561	skb_drop_list(&skb_shinfo(skb)->frag_list);
 562}
 563
 564static void skb_clone_fraglist(struct sk_buff *skb)
 565{
 566	struct sk_buff *list;
 567
 568	skb_walk_frags(skb, list)
 569		skb_get(list);
 570}
 571
 572static void skb_free_head(struct sk_buff *skb)
 573{
 574	unsigned char *head = skb->head;
 575
 576	if (skb->head_frag)
 577		skb_free_frag(head);
 578	else
 579		kfree(head);
 580}
 581
 582static void skb_release_data(struct sk_buff *skb)
 583{
 584	struct skb_shared_info *shinfo = skb_shinfo(skb);
 585	int i;
 586
 587	if (skb->cloned &&
 588	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 589			      &shinfo->dataref))
 590		return;
 591
 592	for (i = 0; i < shinfo->nr_frags; i++)
 593		__skb_frag_unref(&shinfo->frags[i]);
 594
 595	/*
 596	 * If skb buf is from userspace, we need to notify the caller
 597	 * the lower device DMA has done;
 598	 */
 599	if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
 600		struct ubuf_info *uarg;
 601
 602		uarg = shinfo->destructor_arg;
 603		if (uarg->callback)
 604			uarg->callback(uarg, true);
 605	}
 606
 607	if (shinfo->frag_list)
 608		kfree_skb_list(shinfo->frag_list);
 609
 610	skb_free_head(skb);
 611}
 612
 613/*
 614 *	Free an skbuff by memory without cleaning the state.
 615 */
 616static void kfree_skbmem(struct sk_buff *skb)
 617{
 618	struct sk_buff_fclones *fclones;
 619
 620	switch (skb->fclone) {
 621	case SKB_FCLONE_UNAVAILABLE:
 622		kmem_cache_free(skbuff_head_cache, skb);
 623		return;
 624
 625	case SKB_FCLONE_ORIG:
 626		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 627
 628		/* We usually free the clone (TX completion) before original skb
 629		 * This test would have no chance to be true for the clone,
 630		 * while here, branch prediction will be good.
 631		 */
 632		if (atomic_read(&fclones->fclone_ref) == 1)
 633			goto fastpath;
 634		break;
 635
 636	default: /* SKB_FCLONE_CLONE */
 637		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 638		break;
 639	}
 640	if (!atomic_dec_and_test(&fclones->fclone_ref))
 641		return;
 642fastpath:
 643	kmem_cache_free(skbuff_fclone_cache, fclones);
 644}
 645
 646static void skb_release_head_state(struct sk_buff *skb)
 647{
 648	skb_dst_drop(skb);
 649#ifdef CONFIG_XFRM
 650	secpath_put(skb->sp);
 651#endif
 652	if (skb->destructor) {
 653		WARN_ON(in_irq());
 654		skb->destructor(skb);
 655	}
 656#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 657	nf_conntrack_put(skb_nfct(skb));
 658#endif
 659#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 660	nf_bridge_put(skb->nf_bridge);
 661#endif
 662}
 663
 664/* Free everything but the sk_buff shell. */
 665static void skb_release_all(struct sk_buff *skb)
 666{
 667	skb_release_head_state(skb);
 668	if (likely(skb->head))
 669		skb_release_data(skb);
 670}
 671
 /**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Unconditionally free an sk_buff: first release everything attached
 *	to it, then give the sk_buff memory back. Internal helper; users
 *	should always call kfree_skb instead.
 */

void __kfree_skb(struct sk_buff *skb)
{
	/* attached state and data first ... */
	skb_release_all(skb);

	/* ... then the shell itself */
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
 687
 688/**
 689 *	kfree_skb - free an sk_buff
 690 *	@skb: buffer to free
 691 *
 692 *	Drop a reference to the buffer and free it if the usage count has
 693 *	hit zero.
 694 */
 695void kfree_skb(struct sk_buff *skb)
 696{
 697	if (unlikely(!skb))
 698		return;
 699	if (likely(atomic_read(&skb->users) == 1))
 700		smp_rmb();
 701	else if (likely(!atomic_dec_and_test(&skb->users)))
 702		return;
 703	trace_kfree_skb(skb, __builtin_return_address(0));
 704	__kfree_skb(skb);
 705}
 706EXPORT_SYMBOL(kfree_skb);
 707
 708void kfree_skb_list(struct sk_buff *segs)
 709{
 710	while (segs) {
 711		struct sk_buff *next = segs->next;
 712
 713		kfree_skb(segs);
 714		segs = next;
 715	}
 716}
 717EXPORT_SYMBOL(kfree_skb_list);
 718
 719/**
 720 *	skb_tx_error - report an sk_buff xmit error
 721 *	@skb: buffer that triggered an error
 722 *
 723 *	Report xmit error if a device callback is tracking this skb.
 724 *	skb must be freed afterwards.
 725 */
 726void skb_tx_error(struct sk_buff *skb)
 727{
 728	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 729		struct ubuf_info *uarg;
 730
 731		uarg = skb_shinfo(skb)->destructor_arg;
 732		if (uarg->callback)
 733			uarg->callback(uarg, false);
 734		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 735	}
 736}
 737EXPORT_SYMBOL(skb_tx_error);
 738
 739/**
 740 *	consume_skb - free an skbuff
 741 *	@skb: buffer to free
 742 *
 743 *	Drop a ref to the buffer and free it if the usage count has hit zero
 744 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 745 *	is being dropped after a failure and notes that
 746 */
 747void consume_skb(struct sk_buff *skb)
 748{
 749	if (unlikely(!skb))
 750		return;
 751	if (likely(atomic_read(&skb->users) == 1))
 752		smp_rmb();
 753	else if (likely(!atomic_dec_and_test(&skb->users)))
 754		return;
 755	trace_consume_skb(skb);
 756	__kfree_skb(skb);
 757}
 758EXPORT_SYMBOL(consume_skb);
 759
 760void __kfree_skb_flush(void)
 761{
 762	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 763
 764	/* flush skb_cache if containing objects */
 765	if (nc->skb_count) {
 766		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
 767				     nc->skb_cache);
 768		nc->skb_count = 0;
 769	}
 770}
 771
 772static inline void _kfree_skb_defer(struct sk_buff *skb)
 773{
 774	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 775
 776	/* drop skb->head and call any destructors for packet */
 777	skb_release_all(skb);
 778
 779	/* record skb to CPU local list */
 780	nc->skb_cache[nc->skb_count++] = skb;
 781
 782#ifdef CONFIG_SLUB
 783	/* SLUB writes into objects when freeing */
 784	prefetchw(skb);
 785#endif
 786
 787	/* flush skb_cache if it is filled */
 788	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
 789		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
 790				     nc->skb_cache);
 791		nc->skb_count = 0;
 792	}
 793}
 /* Non-static entry point that simply forwards to the inline _kfree_skb_defer(). */
void __kfree_skb_defer(struct sk_buff *skb)
{
	_kfree_skb_defer(skb);
}
 798
 799void napi_consume_skb(struct sk_buff *skb, int budget)
 800{
 801	if (unlikely(!skb))
 802		return;
 803
 804	/* Zero budget indicate non-NAPI context called us, like netpoll */
 805	if (unlikely(!budget)) {
 806		dev_consume_skb_any(skb);
 807		return;
 808	}
 809
 810	if (likely(atomic_read(&skb->users) == 1))
 811		smp_rmb();
 812	else if (likely(!atomic_dec_and_test(&skb->users)))
 813		return;
 814	/* if reaching here SKB is ready to free */
 815	trace_consume_skb(skb);
 816
 817	/* if SKB is a clone, don't handle this case */
 818	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
 819		__kfree_skb(skb);
 820		return;
 821	}
 822
 823	_kfree_skb_defer(skb);
 824}
 825EXPORT_SYMBOL(napi_consume_skb);
 826
 /*
 * Make sure a field is enclosed inside headers_start/headers_end section.
 *
 * Fails the build if @field lies before headers_start or after headers_end
 * in struct sk_buff. The two checks are wrapped in do { } while (0) so the
 * macro expands to exactly one statement and stays safe under an unbraced
 * if/else. There is deliberately no trailing line continuation, so the
 * line that follows the macro is never absorbed into it.
 */
#define CHECK_SKB_FIELD(field)						\
	do {								\
		BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
			     offsetof(struct sk_buff, headers_start));	\
		BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
			     offsetof(struct sk_buff, headers_end));	\
	} while (0)
 833
 /*
 * Copy the header-level state of @old into @new: timestamp, device, the
 * control block, the dst entry, the secpath (CONFIG_XFRM), the netfilter
 * state via __nf_copy(), queue_mapping, and finally every field laid out
 * between headers_start and headers_end in a single memcpy().
 * old->sk is deliberately not copied.
 */
 834static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 835{
 836	new->tstamp		= old->tstamp;
 837	/* We do not copy old->sk */
 838	new->dev		= old->dev;
 839	memcpy(new->cb, old->cb, sizeof(old->cb));
 840	skb_dst_copy(new, old);
 841#ifdef CONFIG_XFRM
 842	new->sp			= secpath_get(old->sp);
 843#endif
 844	__nf_copy(new, old, false);
 845
 846	/* Note : this field could be in headers_start/headers_end section
 847	 * It is not yet because we do not want to have a 16 bit hole
 848	 */
 849	new->queue_mapping = old->queue_mapping;
 850
	/* Bulk copy of the region delimited by headers_start and headers_end */
 851	memcpy(&new->headers_start, &old->headers_start,
 852	       offsetof(struct sk_buff, headers_end) -
 853	       offsetof(struct sk_buff, headers_start));
	/*
	 * Compile-time checks only: each listed field must lie inside the
	 * region copied above, otherwise the build fails.
	 */
 854	CHECK_SKB_FIELD(protocol);
 855	CHECK_SKB_FIELD(csum);
 856	CHECK_SKB_FIELD(hash);
 857	CHECK_SKB_FIELD(priority);
 858	CHECK_SKB_FIELD(skb_iif);
 859	CHECK_SKB_FIELD(vlan_proto);
 860	CHECK_SKB_FIELD(vlan_tci);
 861	CHECK_SKB_FIELD(transport_header);
 862	CHECK_SKB_FIELD(network_header);
 863	CHECK_SKB_FIELD(mac_header);
 864	CHECK_SKB_FIELD(inner_protocol);
 865	CHECK_SKB_FIELD(inner_transport_header);
 866	CHECK_SKB_FIELD(inner_network_header);
 867	CHECK_SKB_FIELD(inner_mac_header);
 868	CHECK_SKB_FIELD(mark);
 869#ifdef CONFIG_NETWORK_SECMARK
 870	CHECK_SKB_FIELD(secmark);
 871#endif
 872#ifdef CONFIG_NET_RX_BUSY_POLL
 873	CHECK_SKB_FIELD(napi_id);
 874#endif
 875#ifdef CONFIG_XPS
 876	CHECK_SKB_FIELD(sender_cpu);
 877#endif
 878#ifdef CONFIG_NET_SCHED
 879	CHECK_SKB_FIELD(tc_index);
 880#endif
 881
 882}
 883
 884/*
 885 * You should not add any new code to this function.  Add it to
 886 * __copy_skb_header above instead.
 887 */
 888static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 889{
 890#define C(x) n->x = skb->x
 891
 892	n->next = n->prev = NULL;
 893	n->sk = NULL;
 894	__copy_skb_header(n, skb);
 895
 896	C(len);
 897	C(data_len);
 898	C(mac_len);
 899	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 900	n->cloned = 1;
 901	n->nohdr = 0;
 902	n->destructor = NULL;
 903	C(tail);
 904	C(end);
 905	C(head);
 906	C(head_frag);
 907	C(data);
 908	C(truesize);
 909	atomic_set(&n->users, 1);
 910
 911	atomic_inc(&(skb_shinfo(skb)->dataref));
 912	skb->cloned = 1;
 913
 914	return n;
 915#undef C
 916}
 917
 /**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	Same as skb_clone, except that the caller provides the target skb.
 *	Whatever @dst held before is released first.
 *
 *	Returns the target skb.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	/* let go of dst's current state and data before overwriting it */
	skb_release_all(dst);

	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
 934
 935/**
 936 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
 937 *	@skb: the skb to modify
 938 *	@gfp_mask: allocation priority
 939 *
 940 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
 941 *	It will copy all frags into kernel and drop the reference
 942 *	to userspace pages.
 943 *
 944 *	If this function is called from an interrupt gfp_mask() must be
 945 *	%GFP_ATOMIC.
 946 *
 947 *	Returns 0 on success or a negative error code on failure
 948 *	to allocate kernel memory to copy to.
 949 */
 950int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 951{
 952	int i;
 953	int num_frags = skb_shinfo(skb)->nr_frags;
 954	struct page *page, *head = NULL;
 955	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
 956
 957	for (i = 0; i < num_frags; i++) {
 958		u8 *vaddr;
 959		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 960
 961		page = alloc_page(gfp_mask);
 962		if (!page) {
 963			while (head) {
 964				struct page *next = (struct page *)page_private(head);
 965				put_page(head);
 966				head = next;
 967			}
 968			return -ENOMEM;
 969		}
 970		vaddr = kmap_atomic(skb_frag_page(f));
 971		memcpy(page_address(page),
 972		       vaddr + f->page_offset, skb_frag_size(f));
 973		kunmap_atomic(vaddr);
 974		set_page_private(page, (unsigned long)head);
 975		head = page;
 976	}
 977
 978	/* skb frags release userspace buffers */
 979	for (i = 0; i < num_frags; i++)
 980		skb_frag_unref(skb, i);
 981
 982	uarg->callback(uarg, false);
 983
 984	/* skb frags point to kernel buffers */
 985	for (i = num_frags - 1; i >= 0; i--) {
 986		__skb_fill_page_desc(skb, i, head, 0,
 987				     skb_shinfo(skb)->frags[i].size);
 988		head = (struct page *)page_private(head);
 989	}
 990
 991	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 992	return 0;
 993}
 994EXPORT_SYMBOL_GPL(skb_copy_ubufs);
 995
 996/**
 997 *	skb_clone	-	duplicate an sk_buff
 998 *	@skb: buffer to clone
 999 *	@gfp_mask: allocation priority
1000 *
1001 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
1002 *	copies share the same packet data but not structure. The new
1003 *	buffer has a reference count of 1. If the allocation fails the
1004 *	function returns %NULL otherwise the new buffer is returned.
1005 *
1006 *	If this function is called from an interrupt gfp_mask() must be
1007 *	%GFP_ATOMIC.
1008 */
1009
1010struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1011{
1012	struct sk_buff_fclones *fclones = container_of(skb,
1013						       struct sk_buff_fclones,
1014						       skb1);
1015	struct sk_buff *n;
1016
1017	if (skb_orphan_frags(skb, gfp_mask))
1018		return NULL;
1019
1020	if (skb->fclone == SKB_FCLONE_ORIG &&
1021	    atomic_read(&fclones->fclone_ref) == 1) {
1022		n = &fclones->skb2;
1023		atomic_set(&fclones->fclone_ref, 2);
1024	} else {
1025		if (skb_pfmemalloc(skb))
1026			gfp_mask |= __GFP_MEMALLOC;
1027
1028		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
1029		if (!n)
1030			return NULL;
1031
1032		kmemcheck_annotate_bitfield(n, flags1);
1033		n->fclone = SKB_FCLONE_UNAVAILABLE;
1034	}
1035
1036	return __skb_clone(n, skb);
1037}
1038EXPORT_SYMBOL(skb_clone);
1039
1040static void skb_headers_offset_update(struct sk_buff *skb, int off)
1041{
1042	/* Only adjust this if it actually is csum_start rather than csum */
1043	if (skb->ip_summed == CHECKSUM_PARTIAL)
1044		skb->csum_start += off;
1045	/* {transport,network,mac}_header and tail are relative to skb->head */
1046	skb->transport_header += off;
1047	skb->network_header   += off;
1048	if (skb_mac_header_was_set(skb))
1049		skb->mac_header += off;
1050	skb->inner_transport_header += off;
1051	skb->inner_network_header += off;
1052	skb->inner_mac_header += off;
1053}
1054
1055static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
1056{
1057	__copy_skb_header(new, old);
1058
1059	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
1060	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
1061	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
1062}
1063
1064static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
1065{
1066	if (skb_pfmemalloc(skb))
1067		return SKB_ALLOC_RX;
1068	return 0;
1069}
1070
1071/**
1072 *	skb_copy	-	create private copy of an sk_buff
1073 *	@skb: buffer to copy
1074 *	@gfp_mask: allocation priority
1075 *
1076 *	Make a copy of both an &sk_buff and its data. This is used when the
1077 *	caller wishes to modify the data and needs a private copy of the
1078 *	data to alter. Returns %NULL on failure or the pointer to the buffer
1079 *	on success. The returned buffer has a reference count of 1.
1080 *
1081 *	As by-product this function converts non-linear &sk_buff to linear
1082 *	one, so that &sk_buff becomes completely private and caller is allowed
1083 *	to modify all the data of returned buffer. This means that this
1084 *	function is not recommended for use in circumstances when only
1085 *	header is going to be modified. Use pskb_copy() instead.
1086 */
1087
1088struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
1089{
1090	int headerlen = skb_headroom(skb);
1091	unsigned int size = skb_end_offset(skb) + skb->data_len;
1092	struct sk_buff *n = __alloc_skb(size, gfp_mask,
1093					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
1094
1095	if (!n)
1096		return NULL;
1097
1098	/* Set the data pointer */
1099	skb_reserve(n, headerlen);
1100	/* Set the tail pointer and length */
1101	skb_put(n, skb->len);
1102
1103	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
1104		BUG();
1105
1106	copy_skb_header(n, skb);
1107	return n;
1108}
1109EXPORT_SYMBOL(skb_copy);
1110
1111/**
1112 *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.
1113 *	@skb: buffer to copy
1114 *	@headroom: headroom of new skb
1115 *	@gfp_mask: allocation priority
1116 *	@fclone: if true allocate the copy of the skb from the fclone
1117 *	cache instead of the head cache; it is recommended to set this
1118 *	to true for the cases where the copy will likely be cloned
1119 *
1120 *	Make a copy of both an &sk_buff and part of its data, located
1121 *	in header. Fragmented data remain shared. This is used when
1122 *	the caller wishes to modify only header of &sk_buff and needs
1123 *	private copy of the header to alter. Returns %NULL on failure
1124 *	or the pointer to the buffer on success.
1125 *	The returned buffer has a reference count of 1.
1126 */
1127
1128struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
1129				   gfp_t gfp_mask, bool fclone)
1130{
1131	unsigned int size = skb_headlen(skb) + headroom;
1132	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
1133	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
1134
1135	if (!n)
1136		goto out;
1137
1138	/* Set the data pointer */
1139	skb_reserve(n, headroom);
1140	/* Set the tail pointer and length */
1141	skb_put(n, skb_headlen(skb));
1142	/* Copy the bytes */
1143	skb_copy_from_linear_data(skb, n->data, n->len);
1144
1145	n->truesize += skb->data_len;
1146	n->data_len  = skb->data_len;
1147	n->len	     = skb->len;
1148
1149	if (skb_shinfo(skb)->nr_frags) {
1150		int i;
1151
1152		if (skb_orphan_frags(skb, gfp_mask)) {
1153			kfree_skb(n);
1154			n = NULL;
1155			goto out;
1156		}
1157		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1158			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
1159			skb_frag_ref(skb, i);
1160		}
1161		skb_shinfo(n)->nr_frags = i;
1162	}
1163
1164	if (skb_has_frag_list(skb)) {
1165		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
1166		skb_clone_fraglist(n);
1167	}
1168
1169	copy_skb_header(n, skb);
1170out:
1171	return n;
1172}
1173EXPORT_SYMBOL(__pskb_copy_fclone);
1174
1175/**
1176 *	pskb_expand_head - reallocate header of &sk_buff
1177 *	@skb: buffer to reallocate
1178 *	@nhead: room to add at head
1179 *	@ntail: room to add at tail
1180 *	@gfp_mask: allocation priority
1181 *
1182 *	Expands (or creates identical copy, if @nhead and @ntail are zero)
1183 *	header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
1184 *	reference count of 1. Returns zero in the case of success or error,
1185 *	if expansion failed. In the last case, &sk_buff is not changed.
1186 *
1187 *	All the pointers pointing into skb header may change and must be
1188 *	reloaded after call to this function.
1189 */
1190
1191int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1192		     gfp_t gfp_mask)
1193{
1194	int i, osize = skb_end_offset(skb);
1195	int size = osize + nhead + ntail;
1196	long off;
1197	u8 *data;
1198
1199	BUG_ON(nhead < 0);
1200
1201	if (skb_shared(skb))
1202		BUG();
1203
1204	size = SKB_DATA_ALIGN(size);
1205
1206	if (skb_pfmemalloc(skb))
1207		gfp_mask |= __GFP_MEMALLOC;
1208	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1209			       gfp_mask, NUMA_NO_NODE, NULL);
1210	if (!data)
1211		goto nodata;
1212	size = SKB_WITH_OVERHEAD(ksize(data));
1213
1214	/* Copy only real data... and, alas, header. This should be
1215	 * optimized for the cases when header is void.
1216	 */
1217	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
1218
1219	memcpy((struct skb_shared_info *)(data + size),
1220	       skb_shinfo(skb),
1221	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
1222
1223	/*
1224	 * if shinfo is shared we must drop the old head gracefully, but if it
1225	 * is not we can just drop the old head and let the existing refcount
1226	 * be since all we did is relocate the values
1227	 */
1228	if (skb_cloned(skb)) {
1229		/* copy this zero copy skb frags */
1230		if (skb_orphan_frags(skb, gfp_mask))
1231			goto nofrags;
1232		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1233			skb_frag_ref(skb, i);
1234
1235		if (skb_has_frag_list(skb))
1236			skb_clone_fraglist(skb);
1237
1238		skb_release_data(skb);
1239	} else {
1240		skb_free_head(skb);
1241	}
1242	off = (data + nhead) - skb->head;
1243
1244	skb->head     = data;
1245	skb->head_frag = 0;
1246	skb->data    += off;
1247#ifdef NET_SKBUFF_DATA_USES_OFFSET
1248	skb->end      = size;
1249	off           = nhead;
1250#else
1251	skb->end      = skb->head + size;
1252#endif
1253	skb->tail	      += off;
1254	skb_headers_offset_update(skb, nhead);
1255	skb->cloned   = 0;
1256	skb->hdr_len  = 0;
1257	skb->nohdr    = 0;
1258	atomic_set(&skb_shinfo(skb)->dataref, 1);
1259
1260	/* It is not generally safe to change skb->truesize.
1261	 * For the moment, we really care of rx path, or
1262	 * when skb is orphaned (not attached to a socket).
1263	 */
1264	if (!skb->sk || skb->destructor == sock_edemux)
1265		skb->truesize += size - osize;
1266
1267	return 0;
1268
1269nofrags:
1270	kfree(data);
1271nodata:
1272	return -ENOMEM;
1273}
1274EXPORT_SYMBOL(pskb_expand_head);
1275
1276/* Make private copy of skb with writable head and some headroom */
1277
1278struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
1279{
1280	struct sk_buff *skb2;
1281	int delta = headroom - skb_headroom(skb);
1282
1283	if (delta <= 0)
1284		skb2 = pskb_copy(skb, GFP_ATOMIC);
1285	else {
1286		skb2 = skb_clone(skb, GFP_ATOMIC);
1287		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
1288					     GFP_ATOMIC)) {
1289			kfree_skb(skb2);
1290			skb2 = NULL;
1291		}
1292	}
1293	return skb2;
1294}
1295EXPORT_SYMBOL(skb_realloc_headroom);
1296
1297/**
1298 *	skb_copy_expand	-	copy and expand sk_buff
1299 *	@skb: buffer to copy
1300 *	@newheadroom: new free bytes at head
1301 *	@newtailroom: new free bytes at tail
1302 *	@gfp_mask: allocation priority
1303 *
1304 *	Make a copy of both an &sk_buff and its data and while doing so
1305 *	allocate additional space.
1306 *
1307 *	This is used when the caller wishes to modify the data and needs a
1308 *	private copy of the data to alter as well as more space for new fields.
1309 *	Returns %NULL on failure or the pointer to the buffer
1310 *	on success. The returned buffer has a reference count of 1.
1311 *
1312 *	You must pass %GFP_ATOMIC as the allocation priority if this function
1313 *	is called from an interrupt.
1314 */
1315struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1316				int newheadroom, int newtailroom,
1317				gfp_t gfp_mask)
1318{
1319	/*
1320	 *	Allocate the copy buffer
1321	 */
1322	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
1323					gfp_mask, skb_alloc_rx_flag(skb),
1324					NUMA_NO_NODE);
1325	int oldheadroom = skb_headroom(skb);
1326	int head_copy_len, head_copy_off;
1327
1328	if (!n)
1329		return NULL;
1330
1331	skb_reserve(n, newheadroom);
1332
1333	/* Set the tail pointer and length */
1334	skb_put(n, skb->len);
1335
1336	head_copy_len = oldheadroom;
1337	head_copy_off = 0;
1338	if (newheadroom <= head_copy_len)
1339		head_copy_len = newheadroom;
1340	else
1341		head_copy_off = newheadroom - head_copy_len;
1342
1343	/* Copy the linear header and data. */
1344	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1345			  skb->len + head_copy_len))
1346		BUG();
1347
1348	copy_skb_header(n, skb);
1349
1350	skb_headers_offset_update(n, newheadroom - oldheadroom);
1351
1352	return n;
1353}
1354EXPORT_SYMBOL(skb_copy_expand);
1355
1356/**
1357 *	skb_pad			-	zero pad the tail of an skb
1358 *	@skb: buffer to pad
1359 *	@pad: space to pad
1360 *
1361 *	Ensure that a buffer is followed by a padding area that is zero
1362 *	filled. Used by network drivers which may DMA or transfer data
1363 *	beyond the buffer end onto the wire.
1364 *
1365 *	May return error in out of memory cases. The skb is freed on error.
1366 */
1367
1368int skb_pad(struct sk_buff *skb, int pad)
1369{
1370	int err;
1371	int ntail;
1372
1373	/* If the skbuff is non linear tailroom is always zero.. */
1374	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
1375		memset(skb->data+skb->len, 0, pad);
1376		return 0;
1377	}
1378
1379	ntail = skb->data_len + pad - (skb->end - skb->tail);
1380	if (likely(skb_cloned(skb) || ntail > 0)) {
1381		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1382		if (unlikely(err))
1383			goto free_skb;
1384	}
1385
1386	/* FIXME: The use of this function with non-linear skb's really needs
1387	 * to be audited.
1388	 */
1389	err = skb_linearize(skb);
1390	if (unlikely(err))
1391		goto free_skb;
1392
1393	memset(skb->data + skb->len, 0, pad);
1394	return 0;
1395
1396free_skb:
1397	kfree_skb(skb);
1398	return err;
1399}
1400EXPORT_SYMBOL(skb_pad);
1401
1402/**
1403 *	pskb_put - add data to the tail of a potentially fragmented buffer
1404 *	@skb: start of the buffer to use
1405 *	@tail: tail fragment of the buffer to use
1406 *	@len: amount of data to add
1407 *
1408 *	This function extends the used data area of the potentially
1409 *	fragmented buffer. @tail must be the last fragment of @skb -- or
1410 *	@skb itself. If this would exceed the total buffer size the kernel
1411 *	will panic. A pointer to the first byte of the extra data is
1412 *	returned.
1413 */
1414
1415unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
1416{
1417	if (tail != skb) {
1418		skb->data_len += len;
1419		skb->len += len;
1420	}
1421	return skb_put(tail, len);
1422}
1423EXPORT_SYMBOL_GPL(pskb_put);
1424
1425/**
1426 *	skb_put - add data to a buffer
1427 *	@skb: buffer to use
1428 *	@len: amount of data to add
1429 *
1430 *	This function extends the used data area of the buffer. If this would
1431 *	exceed the total buffer size the kernel will panic. A pointer to the
1432 *	first byte of the extra data is returned.
1433 */
1434unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
1435{
1436	unsigned char *tmp = skb_tail_pointer(skb);
1437	SKB_LINEAR_ASSERT(skb);
1438	skb->tail += len;
1439	skb->len  += len;
1440	if (unlikely(skb->tail > skb->end))
1441		skb_over_panic(skb, len, __builtin_return_address(0));
1442	return tmp;
1443}
1444EXPORT_SYMBOL(skb_put);
1445
1446/**
1447 *	skb_push - add data to the start of a buffer
1448 *	@skb: buffer to use
1449 *	@len: amount of data to add
1450 *
1451 *	This function extends the used data area of the buffer at the buffer
1452 *	start. If this would exceed the total buffer headroom the kernel will
1453 *	panic. A pointer to the first byte of the extra data is returned.
1454 */
1455unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
1456{
1457	skb->data -= len;
1458	skb->len  += len;
1459	if (unlikely(skb->data<skb->head))
1460		skb_under_panic(skb, len, __builtin_return_address(0));
1461	return skb->data;
1462}
1463EXPORT_SYMBOL(skb_push);
1464
/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	Removes @len bytes from the start of the buffer and hands that
 *	memory back to the headroom. Returns a pointer to the next data in
 *	the buffer. After the pull, later pushes will overwrite the old
 *	data.
 *
 *	Out-of-line wrapper around skb_pull_inline().
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	unsigned char *next = skb_pull_inline(skb, len);

	return next;
}
EXPORT_SYMBOL(skb_pull);
1480
1481/**
1482 *	skb_trim - remove end from a buffer
1483 *	@skb: buffer to alter
1484 *	@len: new length
1485 *
1486 *	Cut the length of a buffer down by removing data from the tail. If
1487 *	the buffer is already under the length specified it is not modified.
1488 *	The skb must be linear.
1489 */
1490void skb_trim(struct sk_buff *skb, unsigned int len)
1491{
1492	if (skb->len > len)
1493		__skb_trim(skb, len);
1494}
1495EXPORT_SYMBOL(skb_trim);
1496
1497/* Trims skb to length len. It can change skb pointers.
1498 */
1499
1500int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1501{
1502	struct sk_buff **fragp;
1503	struct sk_buff *frag;
1504	int offset = skb_headlen(skb);
1505	int nfrags = skb_shinfo(skb)->nr_frags;
1506	int i;
1507	int err;
1508
1509	if (skb_cloned(skb) &&
1510	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1511		return err;
1512
1513	i = 0;
1514	if (offset >= len)
1515		goto drop_pages;
1516
1517	for (; i < nfrags; i++) {
1518		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1519
1520		if (end < len) {
1521			offset = end;
1522			continue;
1523		}
1524
1525		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
1526
1527drop_pages:
1528		skb_shinfo(skb)->nr_frags = i;
1529
1530		for (; i < nfrags; i++)
1531			skb_frag_unref(skb, i);
1532
1533		if (skb_has_frag_list(skb))
1534			skb_drop_fraglist(skb);
1535		goto done;
1536	}
1537
1538	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1539	     fragp = &frag->next) {
1540		int end = offset + frag->len;
1541
1542		if (skb_shared(frag)) {
1543			struct sk_buff *nfrag;
1544
1545			nfrag = skb_clone(frag, GFP_ATOMIC);
1546			if (unlikely(!nfrag))
1547				return -ENOMEM;
1548
1549			nfrag->next = frag->next;
1550			consume_skb(frag);
1551			frag = nfrag;
1552			*fragp = frag;
1553		}
1554
1555		if (end < len) {
1556			offset = end;
1557			continue;
1558		}
1559
1560		if (end > len &&
1561		    unlikely((err = pskb_trim(frag, len - offset))))
1562			return err;
1563
1564		if (frag->next)
1565			skb_drop_list(&frag->next);
1566		break;
1567	}
1568
1569done:
1570	if (len > skb_headlen(skb)) {
1571		skb->data_len -= skb->len - len;
1572		skb->len       = len;
1573	} else {
1574		skb->len       = len;
1575		skb->data_len  = 0;
1576		skb_set_tail_pointer(skb, len);
1577	}
1578
1579	return 0;
1580}
1581EXPORT_SYMBOL(___pskb_trim);
1582
1583/**
1584 *	__pskb_pull_tail - advance tail of skb header
1585 *	@skb: buffer to reallocate
1586 *	@delta: number of bytes to advance tail
1587 *
1588 *	The function makes a sense only on a fragmented &sk_buff,
1589 *	it expands header moving its tail forward and copying necessary
1590 *	data from fragmented part.
1591 *
1592 *	&sk_buff MUST have reference count of 1.
1593 *
1594 *	Returns %NULL (and &sk_buff does not change) if pull failed
1595 *	or value of new tail of skb in the case of success.
1596 *
1597 *	All the pointers pointing into skb header may change and must be
1598 *	reloaded after call to this function.
1599 */
1600
1601/* Moves tail of skb head forward, copying data from fragmented part,
1602 * when it is necessary.
1603 * 1. It may fail due to malloc failure.
1604 * 2. It may change skb pointers.
1605 *
1606 * It is pretty complicated. Luckily, it is called only in exceptional cases.
1607 */
1608unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1609{
1610	/* If skb has not enough free space at tail, get new one
1611	 * plus 128 bytes for future expansions. If we have enough
1612	 * room at tail, reallocate without expansion only if skb is cloned.
1613	 */
1614	int i, k, eat = (skb->tail + delta) - skb->end;
1615
1616	if (eat > 0 || skb_cloned(skb)) {
1617		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
1618				     GFP_ATOMIC))
1619			return NULL;
1620	}
1621
1622	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
1623		BUG();
1624
1625	/* Optimization: no fragments, no reasons to preestimate
1626	 * size of pulled pages. Superb.
1627	 */
1628	if (!skb_has_frag_list(skb))
1629		goto pull_pages;
1630
1631	/* Estimate size of pulled pages. */
1632	eat = delta;
1633	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1634		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1635
1636		if (size >= eat)
1637			goto pull_pages;
1638		eat -= size;
1639	}
1640
1641	/* If we need update frag list, we are in troubles.
1642	 * Certainly, it possible to add an offset to skb data,
1643	 * but taking into account that pulling is expected to
1644	 * be very rare operation, it is worth to fight against
1645	 * further bloating skb head and crucify ourselves here instead.
1646	 * Pure masohism, indeed. 8)8)
1647	 */
1648	if (eat) {
1649		struct sk_buff *list = skb_shinfo(skb)->frag_list;
1650		struct sk_buff *clone = NULL;
1651		struct sk_buff *insp = NULL;
1652
1653		do {
1654			BUG_ON(!list);
1655
1656			if (list->len <= eat) {
1657				/* Eaten as whole. */
1658				eat -= list->len;
1659				list = list->next;
1660				insp = list;
1661			} else {
1662				/* Eaten partially. */
1663
1664				if (skb_shared(list)) {
1665					/* Sucks! We need to fork list. :-( */
1666					clone = skb_clone(list, GFP_ATOMIC);
1667					if (!clone)
1668						return NULL;
1669					insp = list->next;
1670					list = clone;
1671				} else {
1672					/* This may be pulled without
1673					 * problems. */
1674					insp = list;
1675				}
1676				if (!pskb_pull(list, eat)) {
1677					kfree_skb(clone);
1678					return NULL;
1679				}
1680				break;
1681			}
1682		} while (eat);
1683
1684		/* Free pulled out fragments. */
1685		while ((list = skb_shinfo(skb)->frag_list) != insp) {
1686			skb_shinfo(skb)->frag_list = list->next;
1687			kfree_skb(list);
1688		}
1689		/* And insert new clone at head. */
1690		if (clone) {
1691			clone->next = list;
1692			skb_shinfo(skb)->frag_list = clone;
1693		}
1694	}
1695	/* Success! Now we may commit changes to skb data. */
1696
1697pull_pages:
1698	eat = delta;
1699	k = 0;
1700	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1701		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1702
1703		if (size <= eat) {
1704			skb_frag_unref(skb, i);
1705			eat -= size;
1706		} else {
1707			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1708			if (eat) {
1709				skb_shinfo(skb)->frags[k].page_offset += eat;
1710				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
1711				eat = 0;
1712			}
1713			k++;
1714		}
1715	}
1716	skb_shinfo(skb)->nr_frags = k;
1717
1718	skb->tail     += delta;
1719	skb->data_len -= delta;
1720
1721	return skb_tail_pointer(skb);
1722}
1723EXPORT_SYMBOL(__pskb_pull_tail);
1724
1725/**
1726 *	skb_copy_bits - copy bits from skb to kernel buffer
1727 *	@skb: source skb
1728 *	@offset: offset in source
1729 *	@to: destination buffer
1730 *	@len: number of bytes to copy
1731 *
1732 *	Copy the specified number of bytes from the source skb to the
1733 *	destination buffer.
1734 *
1735 *	CAUTION ! :
1736 *		If its prototype is ever changed,
1737 *		check arch/{*}/net/{*}.S files,
1738 *		since it is called from BPF assembly code.
1739 */
1740int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
1741{
1742	int start = skb_headlen(skb);
1743	struct sk_buff *frag_iter;
1744	int i, copy;
1745
1746	if (offset > (int)skb->len - len)
1747		goto fault;
1748
1749	/* Copy header. */
1750	if ((copy = start - offset) > 0) {
1751		if (copy > len)
1752			copy = len;
1753		skb_copy_from_linear_data_offset(skb, offset, to, copy);
1754		if ((len -= copy) == 0)
1755			return 0;
1756		offset += copy;
1757		to     += copy;
1758	}
1759
1760	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1761		int end;
1762		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1763
1764		WARN_ON(start > offset + len);
1765
1766		end = start + skb_frag_size(f);
1767		if ((copy = end - offset) > 0) {
1768			u8 *vaddr;
1769
1770			if (copy > len)
1771				copy = len;
1772
1773			vaddr = kmap_atomic(skb_frag_page(f));
1774			memcpy(to,
1775			       vaddr + f->page_offset + offset - start,
1776			       copy);
1777			kunmap_atomic(vaddr);
1778
1779			if ((len -= copy) == 0)
1780				return 0;
1781			offset += copy;
1782			to     += copy;
1783		}
1784		start = end;
1785	}
1786
1787	skb_walk_frags(skb, frag_iter) {
1788		int end;
1789
1790		WARN_ON(start > offset + len);
1791
1792		end = start + frag_iter->len;
1793		if ((copy = end - offset) > 0) {
1794			if (copy > len)
1795				copy = len;
1796			if (skb_copy_bits(frag_iter, offset - start, to, copy))
1797				goto fault;
1798			if ((len -= copy) == 0)
1799				return 0;
1800			offset += copy;
1801			to     += copy;
1802		}
1803		start = end;
1804	}
1805
1806	if (!len)
1807		return 0;
1808
1809fault:
1810	return -EFAULT;
1811}
1812EXPORT_SYMBOL(skb_copy_bits);
1813
1814/*
1815 * Callback from splice_to_pipe(), if we need to release some pages
1816 * at the end of the spd in case we error'ed out in filling the pipe.
1817 */
1818static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1819{
1820	put_page(spd->pages[i]);
1821}
1822
1823static struct page *linear_to_page(struct page *page, unsigned int *len,
1824				   unsigned int *offset,
1825				   struct sock *sk)
1826{
1827	struct page_frag *pfrag = sk_page_frag(sk);
1828
1829	if (!sk_page_frag_refill(sk, pfrag))
1830		return NULL;
1831
1832	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
1833
1834	memcpy(page_address(pfrag->page) + pfrag->offset,
1835	       page_address(page) + *offset, *len);
1836	*offset = pfrag->offset;
1837	pfrag->offset += *len;
1838
1839	return pfrag->page;
1840}
1841
1842static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
1843			     struct page *page,
1844			     unsigned int offset)
1845{
1846	return	spd->nr_pages &&
1847		spd->pages[spd->nr_pages - 1] == page &&
1848		(spd->partial[spd->nr_pages - 1].offset +
1849		 spd->partial[spd->nr_pages - 1].len == offset);
1850}
1851
1852/*
1853 * Fill page/offset/length into spd, if it can hold more pages.
1854 */
1855static bool spd_fill_page(struct splice_pipe_desc *spd,
1856			  struct pipe_inode_info *pipe, struct page *page,
1857			  unsigned int *len, unsigned int offset,
1858			  bool linear,
1859			  struct sock *sk)
1860{
1861	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
1862		return true;
1863
1864	if (linear) {
1865		page = linear_to_page(page, len, &offset, sk);
1866		if (!page)
1867			return true;
1868	}
1869	if (spd_can_coalesce(spd, page, offset)) {
1870		spd->partial[spd->nr_pages - 1].len += *len;
1871		return false;
1872	}
1873	get_page(page);
1874	spd->pages[spd->nr_pages] = page;
1875	spd->partial[spd->nr_pages].len = *len;
1876	spd->partial[spd->nr_pages].offset = offset;
1877	spd->nr_pages++;
1878
1879	return false;
1880}
1881
1882static bool __splice_segment(struct page *page, unsigned int poff,
1883			     unsigned int plen, unsigned int *off,
1884			     unsigned int *len,
1885			     struct splice_pipe_desc *spd, bool linear,
1886			     struct sock *sk,
1887			     struct pipe_inode_info *pipe)
1888{
1889	if (!*len)
1890		return true;
1891
1892	/* skip this segment if already processed */
1893	if (*off >= plen) {
1894		*off -= plen;
1895		return false;
1896	}
1897
1898	/* ignore any bits we already processed */
1899	poff += *off;
1900	plen -= *off;
1901	*off = 0;
1902
1903	do {
1904		unsigned int flen = min(*len, plen);
1905
1906		if (spd_fill_page(spd, pipe, page, &flen, poff,
1907				  linear, sk))
1908			return true;
1909		poff += flen;
1910		plen -= flen;
1911		*len -= flen;
1912	} while (*len && plen);
1913
1914	return false;
1915}
1916
1917/*
1918 * Map linear and fragment data from the skb to spd. It reports true if the
1919 * pipe is full or if we already spliced the requested length.
1920 */
1921static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
1922			      unsigned int *offset, unsigned int *len,
1923			      struct splice_pipe_desc *spd, struct sock *sk)
1924{
1925	int seg;
1926	struct sk_buff *iter;
1927
1928	/* map the linear part :
1929	 * If skb->head_frag is set, this 'linear' part is backed by a
1930	 * fragment, and if the head is not shared with any clones then
1931	 * we can avoid a copy since we own the head portion of this page.
1932	 */
1933	if (__splice_segment(virt_to_page(skb->data),
1934			     (unsigned long) skb->data & (PAGE_SIZE - 1),
1935			     skb_headlen(skb),
1936			     offset, len, spd,
1937			     skb_head_is_locked(skb),
1938			     sk, pipe))
1939		return true;
1940
1941	/*
1942	 * then map the fragments
1943	 */
1944	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
1945		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1946
1947		if (__splice_segment(skb_frag_page(f),
1948				     f->page_offset, skb_frag_size(f),
1949				     offset, len, spd, false, sk, pipe))
1950			return true;
1951	}
1952
1953	skb_walk_frags(skb, iter) {
1954		if (*offset >= iter->len) {
1955			*offset -= iter->len;
1956			continue;
1957		}
1958		/* __skb_splice_bits() only fails if the output has no room
1959		 * left, so no point in going over the frag_list for the error
1960		 * case.
1961		 */
1962		if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
1963			return true;
1964	}
1965
1966	return false;
1967}
1968
1969/*
1970 * Map data from the skb to a pipe. Should handle both the linear part,
1971 * the fragments, and the frag list.
1972 */
1973int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
1974		    struct pipe_inode_info *pipe, unsigned int tlen,
1975		    unsigned int flags)
1976{
1977	struct partial_page partial[MAX_SKB_FRAGS];
1978	struct page *pages[MAX_SKB_FRAGS];
1979	struct splice_pipe_desc spd = {
1980		.pages = pages,
1981		.partial = partial,
1982		.nr_pages_max = MAX_SKB_FRAGS,
1983		.flags = flags,
1984		.ops = &nosteal_pipe_buf_ops,
1985		.spd_release = sock_spd_release,
1986	};
1987	int ret = 0;
1988
1989	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
1990
1991	if (spd.nr_pages)
1992		ret = splice_to_pipe(pipe, &spd);
1993
1994	return ret;
1995}
1996EXPORT_SYMBOL_GPL(skb_splice_bits);
1997
1998/**
1999 *	skb_store_bits - store bits from kernel buffer to skb
2000 *	@skb: destination buffer
2001 *	@offset: offset in destination
2002 *	@from: source buffer
2003 *	@len: number of bytes to copy
2004 *
2005 *	Copy the specified number of bytes from the source buffer to the
2006 *	destination skb.  This function handles all the messy bits of
2007 *	traversing fragment lists and such.
2008 */
2009
2010int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
2011{
2012	int start = skb_headlen(skb);
2013	struct sk_buff *frag_iter;
2014	int i, copy;
2015
2016	if (offset > (int)skb->len - len)
2017		goto fault;
2018
2019	if ((copy = start - offset) > 0) {
2020		if (copy > len)
2021			copy = len;
2022		skb_copy_to_linear_data_offset(skb, offset, from, copy);
2023		if ((len -= copy) == 0)
2024			return 0;
2025		offset += copy;
2026		from += copy;
2027	}
2028
2029	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2030		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2031		int end;
2032
2033		WARN_ON(start > offset + len);
2034
2035		end = start + skb_frag_size(frag);
2036		if ((copy = end - offset) > 0) {
2037			u8 *vaddr;
2038
2039			if (copy > len)
2040				copy = len;
2041
2042			vaddr = kmap_atomic(skb_frag_page(frag));
2043			memcpy(vaddr + frag->page_offset + offset - start,
2044			       from, copy);
2045			kunmap_atomic(vaddr);
2046
2047			if ((len -= copy) == 0)
2048				return 0;
2049			offset += copy;
2050			from += copy;
2051		}
2052		start = end;
2053	}
2054
2055	skb_walk_frags(skb, frag_iter) {
2056		int end;
2057
2058		WARN_ON(start > offset + len);
2059
2060		end = start + frag_iter->len;
2061		if ((copy = end - offset) > 0) {
2062			if (copy > len)
2063				copy = len;
2064			if (skb_store_bits(frag_iter, offset - start,
2065					   from, copy))
2066				goto fault;
2067			if ((len -= copy) == 0)
2068				return 0;
2069			offset += copy;
2070			from += copy;
2071		}
2072		start = end;
2073	}
2074	if (!len)
2075		return 0;
2076
2077fault:
2078	return -EFAULT;
2079}
2080EXPORT_SYMBOL(skb_store_bits);
2081
2082/* Checksum skb data. */
2083__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
2084		      __wsum csum, const struct skb_checksum_ops *ops)
2085{
2086	int start = skb_headlen(skb);
2087	int i, copy = start - offset;
2088	struct sk_buff *frag_iter;
2089	int pos = 0;
2090
2091	/* Checksum header. */
2092	if (copy > 0) {
2093		if (copy > len)
2094			copy = len;
2095		csum = ops->update(skb->data + offset, copy, csum);
2096		if ((len -= copy) == 0)
2097			return csum;
2098		offset += copy;
2099		pos	= copy;
2100	}
2101
2102	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2103		int end;
2104		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2105
2106		WARN_ON(start > offset + len);
2107
2108		end = start + skb_frag_size(frag);
2109		if ((copy = end - offset) > 0) {
2110			__wsum csum2;
2111			u8 *vaddr;
2112
2113			if (copy > len)
2114				copy = len;
2115			vaddr = kmap_atomic(skb_frag_page(frag));
2116			csum2 = ops->update(vaddr + frag->page_offset +
2117					    offset - start, copy, 0);
2118			kunmap_atomic(vaddr);
2119			csum = ops->combine(csum, csum2, pos, copy);
2120			if (!(len -= copy))
2121				return csum;
2122			offset += copy;
2123			pos    += copy;
2124		}
2125		start = end;
2126	}
2127
2128	skb_walk_frags(skb, frag_iter) {
2129		int end;
2130
2131		WARN_ON(start > offset + len);
2132
2133		end = start + frag_iter->len;
2134		if ((copy = end - offset) > 0) {
2135			__wsum csum2;
2136			if (copy > len)
2137				copy = len;
2138			csum2 = __skb_checksum(frag_iter, offset - start,
2139					       copy, 0, ops);
2140			csum = ops->combine(csum, csum2, pos, copy);
2141			if ((len -= copy) == 0)
2142				return csum;
2143			offset += copy;
2144			pos    += copy;
2145		}
2146		start = end;
2147	}
2148	BUG_ON(len);
2149
2150	return csum;
2151}
2152EXPORT_SYMBOL(__skb_checksum);
2153
2154__wsum skb_checksum(const struct sk_buff *skb, int offset,
2155		    int len, __wsum csum)
2156{
2157	const struct skb_checksum_ops ops = {
2158		.update  = csum_partial_ext,
2159		.combine = csum_block_add_ext,
2160	};
2161
2162	return __skb_checksum(skb, offset, len, csum, &ops);
2163}
2164EXPORT_SYMBOL(skb_checksum);
2165
2166/* Both of above in one bottle. */
2167
2168__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
2169				    u8 *to, int len, __wsum csum)
2170{
2171	int start = skb_headlen(skb);
2172	int i, copy = start - offset;
2173	struct sk_buff *frag_iter;
2174	int pos = 0;
2175
2176	/* Copy header. */
2177	if (copy > 0) {
2178		if (copy > len)
2179			copy = len;
2180		csum = csum_partial_copy_nocheck(skb->data + offset, to,
2181						 copy, csum);
2182		if ((len -= copy) == 0)
2183			return csum;
2184		offset += copy;
2185		to     += copy;
2186		pos	= copy;
2187	}
2188
2189	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2190		int end;
2191
2192		WARN_ON(start > offset + len);
2193
2194		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2195		if ((copy = end - offset) > 0) {
2196			__wsum csum2;
2197			u8 *vaddr;
2198			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2199
2200			if (copy > len)
2201				copy = len;
2202			vaddr = kmap_atomic(skb_frag_page(frag));
2203			csum2 = csum_partial_copy_nocheck(vaddr +
2204							  frag->page_offset +
2205							  offset - start, to,
2206							  copy, 0);
2207			kunmap_atomic(vaddr);
2208			csum = csum_block_add(csum, csum2, pos);
2209			if (!(len -= copy))
2210				return csum;
2211			offset += copy;
2212			to     += copy;
2213			pos    += copy;
2214		}
2215		start = end;
2216	}
2217
2218	skb_walk_frags(skb, frag_iter) {
2219		__wsum csum2;
2220		int end;
2221
2222		WARN_ON(start > offset + len);
2223
2224		end = start + frag_iter->len;
2225		if ((copy = end - offset) > 0) {
2226			if (copy > len)
2227				copy = len;
2228			csum2 = skb_copy_and_csum_bits(frag_iter,
2229						       offset - start,
2230						       to, copy, 0);
2231			csum = csum_block_add(csum, csum2, pos);
2232			if ((len -= copy) == 0)
2233				return csum;
2234			offset += copy;
2235			to     += copy;
2236			pos    += copy;
2237		}
2238		start = end;
2239	}
2240	BUG_ON(len);
2241	return csum;
2242}
2243EXPORT_SYMBOL(skb_copy_and_csum_bits);
2244
2245 /**
2246 *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
2247 *	@from: source buffer
2248 *
2249 *	Calculates the amount of linear headroom needed in the 'to' skb passed
2250 *	into skb_zerocopy().
2251 */
2252unsigned int
2253skb_zerocopy_headlen(const struct sk_buff *from)
2254{
2255	unsigned int hlen = 0;
2256
2257	if (!from->head_frag ||
2258	    skb_headlen(from) < L1_CACHE_BYTES ||
2259	    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
2260		hlen = skb_headlen(from);
2261
2262	if (skb_has_frag_list(from))
2263		hlen = from->len;
2264
2265	return hlen;
2266}
2267EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
2268
2269/**
2270 *	skb_zerocopy - Zero copy skb to skb
2271 *	@to: destination buffer
2272 *	@from: source buffer
2273 *	@len: number of bytes to copy from source buffer
2274 *	@hlen: size of linear headroom in destination buffer
2275 *
2276 *	Copies up to `len` bytes from `from` to `to` by creating references
2277 *	to the frags in the source buffer.
2278 *
2279 *	The `hlen` as calculated by skb_zerocopy_headlen() specifies the
2280 *	headroom in the `to` buffer.
2281 *
2282 *	Return value:
2283 *	0: everything is OK
2284 *	-ENOMEM: couldn't orphan frags of @from due to lack of memory
2285 *	-EFAULT: skb_copy_bits() found some problem with skb geometry
2286 */
2287int
2288skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
2289{
2290	int i, j = 0;
2291	int plen = 0; /* length of skb->head fragment */
2292	int ret;
2293	struct page *page;
2294	unsigned int offset;
2295
2296	BUG_ON(!from->head_frag && !hlen);
2297
2298	/* dont bother with small payloads */
2299	if (len <= skb_tailroom(to))
2300		return skb_copy_bits(from, 0, skb_put(to, len), len);
2301
2302	if (hlen) {
2303		ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
2304		if (unlikely(ret))
2305			return ret;
2306		len -= hlen;
2307	} else {
2308		plen = min_t(int, skb_headlen(from), len);
2309		if (plen) {
2310			page = virt_to_head_page(from->head);
2311			offset = from->data - (unsigned char *)page_address(page);
2312			__skb_fill_page_desc(to, 0, page, offset, plen);
2313			get_page(page);
2314			j = 1;
2315			len -= plen;
2316		}
2317	}
2318
2319	to->truesize += len + plen;
2320	to->len += len + plen;
2321	to->data_len += len + plen;
2322
2323	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
2324		skb_tx_error(from);
2325		return -ENOMEM;
2326	}
2327
2328	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
2329		if (!len)
2330			break;
2331		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
2332		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
2333		len -= skb_shinfo(to)->frags[j].size;
2334		skb_frag_ref(to, j);
2335		j++;
2336	}
2337	skb_shinfo(to)->nr_frags = j;
2338
2339	return 0;
2340}
2341EXPORT_SYMBOL_GPL(skb_zerocopy);
2342
2343void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
2344{
2345	__wsum csum;
2346	long csstart;
2347
2348	if (skb->ip_summed == CHECKSUM_PARTIAL)
2349		csstart = skb_checksum_start_offset(skb);
2350	else
2351		csstart = skb_headlen(skb);
2352
2353	BUG_ON(csstart > skb_headlen(skb));
2354
2355	skb_copy_from_linear_data(skb, to, csstart);
2356
2357	csum = 0;
2358	if (csstart != skb->len)
2359		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
2360					      skb->len - csstart, 0);
2361
2362	if (skb->ip_summed == CHECKSUM_PARTIAL) {
2363		long csstuff = csstart + skb->csum_offset;
2364
2365		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
2366	}
2367}
2368EXPORT_SYMBOL(skb_copy_and_csum_dev);
2369
2370/**
2371 *	skb_dequeue - remove from the head of the queue
2372 *	@list: list to dequeue from
2373 *
2374 *	Remove the head of the list. The list lock is taken so the function
2375 *	may be used safely with other locking list functions. The head item is
2376 *	returned or %NULL if the list is empty.
2377 */
2378
2379struct sk_buff *skb_dequeue(struct sk_buff_head *list)
2380{
2381	unsigned long flags;
2382	struct sk_buff *result;
2383
2384	spin_lock_irqsave(&list->lock, flags);
2385	result = __skb_dequeue(list);
2386	spin_unlock_irqrestore(&list->lock, flags);
2387	return result;
2388}
2389EXPORT_SYMBOL(skb_dequeue);
2390
2391/**
2392 *	skb_dequeue_tail - remove from the tail of the queue
2393 *	@list: list to dequeue from
2394 *
2395 *	Remove the tail of the list. The list lock is taken so the function
2396 *	may be used safely with other locking list functions. The tail item is
2397 *	returned or %NULL if the list is empty.
2398 */
2399struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
2400{
2401	unsigned long flags;
2402	struct sk_buff *result;
2403
2404	spin_lock_irqsave(&list->lock, flags);
2405	result = __skb_dequeue_tail(list);
2406	spin_unlock_irqrestore(&list->lock, flags);
2407	return result;
2408}
2409EXPORT_SYMBOL(skb_dequeue_tail);
2410
2411/**
2412 *	skb_queue_purge - empty a list
2413 *	@list: list to empty
2414 *
2415 *	Delete all buffers on an &sk_buff list. Each buffer is removed from
2416 *	the list and one reference dropped. This function takes the list
2417 *	lock and is atomic with respect to other list locking functions.
2418 */
2419void skb_queue_purge(struct sk_buff_head *list)
2420{
2421	struct sk_buff *skb;
2422	while ((skb = skb_dequeue(list)) != NULL)
2423		kfree_skb(skb);
2424}
2425EXPORT_SYMBOL(skb_queue_purge);
2426
2427/**
2428 *	skb_rbtree_purge - empty a skb rbtree
2429 *	@root: root of the rbtree to empty
2430 *
2431 *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
2432 *	the list and one reference dropped. This function does not take
2433 *	any lock. Synchronization should be handled by the caller (e.g., TCP
2434 *	out-of-order queue is protected by the socket lock).
2435 */
2436void skb_rbtree_purge(struct rb_root *root)
2437{
2438	struct sk_buff *skb, *next;
2439
2440	rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
2441		kfree_skb(skb);
2442
2443	*root = RB_ROOT;
2444}
2445
2446/**
2447 *	skb_queue_head - queue a buffer at the list head
2448 *	@list: list to use
2449 *	@newsk: buffer to queue
2450 *
2451 *	Queue a buffer at the start of the list. This function takes the
2452 *	list lock and can be used safely with other locking &sk_buff functions
2453 *	safely.
2454 *
2455 *	A buffer cannot be placed on two lists at the same time.
2456 */
2457void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
2458{
2459	unsigned long flags;
2460
2461	spin_lock_irqsave(&list->lock, flags);
2462	__skb_queue_head(list, newsk);
2463	spin_unlock_irqrestore(&list->lock, flags);
2464}
2465EXPORT_SYMBOL(skb_queue_head);
2466
2467/**
2468 *	skb_queue_tail - queue a buffer at the list tail
2469 *	@list: list to use
2470 *	@newsk: buffer to queue
2471 *
2472 *	Queue a buffer at the tail of the list. This function takes the
2473 *	list lock and can be used safely with other locking &sk_buff functions
2474 *	safely.
2475 *
2476 *	A buffer cannot be placed on two lists at the same time.
2477 */
2478void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
2479{
2480	unsigned long flags;
2481
2482	spin_lock_irqsave(&list->lock, flags);
2483	__skb_queue_tail(list, newsk);
2484	spin_unlock_irqrestore(&list->lock, flags);
2485}
2486EXPORT_SYMBOL(skb_queue_tail);
2487
2488/**
2489 *	skb_unlink	-	remove a buffer from a list
2490 *	@skb: buffer to remove
2491 *	@list: list to use
2492 *
2493 *	Remove a packet from a list. The list locks are taken and this
2494 *	function is atomic with respect to other list locked calls
2495 *
2496 *	You must know what list the SKB is on.
2497 */
2498void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
2499{
2500	unsigned long flags;
2501
2502	spin_lock_irqsave(&list->lock, flags);
2503	__skb_unlink(skb, list);
2504	spin_unlock_irqrestore(&list->lock, flags);
2505}
2506EXPORT_SYMBOL(skb_unlink);
2507
2508/**
2509 *	skb_append	-	append a buffer
2510 *	@old: buffer to insert after
2511 *	@newsk: buffer to insert
2512 *	@list: list to use
2513 *
2514 *	Place a packet after a given packet in a list. The list locks are taken
2515 *	and this function is atomic with respect to other list locked calls.
2516 *	A buffer cannot be placed on two lists at the same time.
2517 */
2518void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2519{
2520	unsigned long flags;
2521
2522	spin_lock_irqsave(&list->lock, flags);
2523	__skb_queue_after(list, old, newsk);
2524	spin_unlock_irqrestore(&list->lock, flags);
2525}
2526EXPORT_SYMBOL(skb_append);
2527
2528/**
2529 *	skb_insert	-	insert a buffer
2530 *	@old: buffer to insert before
2531 *	@newsk: buffer to insert
2532 *	@list: list to use
2533 *
2534 *	Place a packet before a given packet in a list. The list locks are
2535 * 	taken and this function is atomic with respect to other list locked
2536 *	calls.
2537 *
2538 *	A buffer cannot be placed on two lists at the same time.
2539 */
2540void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2541{
2542	unsigned long flags;
2543
2544	spin_lock_irqsave(&list->lock, flags);
2545	__skb_insert(newsk, old->prev, old, list);
2546	spin_unlock_irqrestore(&list->lock, flags);
2547}
2548EXPORT_SYMBOL(skb_insert);
2549
2550static inline void skb_split_inside_header(struct sk_buff *skb,
2551					   struct sk_buff* skb1,
2552					   const u32 len, const int pos)
2553{
2554	int i;
2555
2556	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
2557					 pos - len);
2558	/* And move data appendix as is. */
2559	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
2560		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
2561
2562	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
2563	skb_shinfo(skb)->nr_frags  = 0;
2564	skb1->data_len		   = skb->data_len;
2565	skb1->len		   += skb1->data_len;
2566	skb->data_len		   = 0;
2567	skb->len		   = len;
2568	skb_set_tail_pointer(skb, len);
2569}
2570
2571static inline void skb_split_no_header(struct sk_buff *skb,
2572				       struct sk_buff* skb1,
2573				       const u32 len, int pos)
2574{
2575	int i, k = 0;
2576	const int nfrags = skb_shinfo(skb)->nr_frags;
2577
2578	skb_shinfo(skb)->nr_frags = 0;
2579	skb1->len		  = skb1->data_len = skb->len - len;
2580	skb->len		  = len;
2581	skb->data_len		  = len - pos;
2582
2583	for (i = 0; i < nfrags; i++) {
2584		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2585
2586		if (pos + size > len) {
2587			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
2588
2589			if (pos < len) {
2590				/* Split frag.
2591				 * We have two variants in this case:
2592				 * 1. Move all the frag to the second
2593				 *    part, if it is possible. F.e.
2594				 *    this approach is mandatory for TUX,
2595				 *    where splitting is expensive.
2596				 * 2. Split is accurately. We make this.
2597				 */
2598				skb_frag_ref(skb, i);
2599				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
2600				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
2601				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
2602				skb_shinfo(skb)->nr_frags++;
2603			}
2604			k++;
2605		} else
2606			skb_shinfo(skb)->nr_frags++;
2607		pos += size;
2608	}
2609	skb_shinfo(skb1)->nr_frags = k;
2610}
2611
2612/**
2613 * skb_split - Split fragmented skb to two parts at length len.
2614 * @skb: the buffer to split
2615 * @skb1: the buffer to receive the second part
2616 * @len: new length for skb
2617 */
2618void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
2619{
2620	int pos = skb_headlen(skb);
2621
2622	skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
2623	if (len < pos)	/* Split line is inside header. */
2624		skb_split_inside_header(skb, skb1, len, pos);
2625	else		/* Second chunk has no header, nothing to copy. */
2626		skb_split_no_header(skb, skb1, len, pos);
2627}
2628EXPORT_SYMBOL(skb_split);
2629
2630/* Shifting from/to a cloned skb is a no-go.
2631 *
2632 * Caller cannot keep skb_shinfo related pointers past calling here!
2633 */
2634static int skb_prepare_for_shift(struct sk_buff *skb)
2635{
2636	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2637}
2638
/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted: either @shiftlen, or 0 if nothing was shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);

	/* Only purely paged source buffers are handled. */
	if (skb_headlen(skb))
		return 0;

	/* todo: bytes still to move; from: next source frag index;
	 * to: next free frag slot in tgt.
	 */
	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      fragfrom->page_offset)) {
		/* First source frag cannot be glued onto tgt's last frag. */
		merge = -1;
	} else {
		/* merge is the index of the tgt frag that will absorb the
		 * first source frag.
		 */
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			/* shiftlen is smaller than the first source frag:
			 * only sizes and the offset need adjusting, no frag
			 * slot changes owner.
			 */
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			fragfrom->page_offset += shiftlen;

			goto onlymerged;
		}

		/* The whole first frag will be merged; continue with the
		 * next source frag.
		 */
		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	/* Fill tgt's free frag slots; tgt's nr_frags is only updated after
	 * the loop, so bailing out here leaves the frag counts of both
	 * buffers untouched.
	 */
	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			/* Whole frag moves over; no extra page reference is
			 * taken for it.
			 */
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			/* Frag is split between the two buffers, so both
			 * need their own page reference.
			 */
			__skb_frag_ref(fragfrom);
			fragto->page = fragfrom->page;
			fragto->page_offset = fragfrom->page_offset;
			skb_frag_size_set(fragto, todo);

			fragfrom->page_offset += todo;
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		/* Perform the delayed merge of the first source frag: grow
		 * the tgt frag and drop the source frag's page reference.
		 */
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}
2774
2775/**
2776 * skb_prepare_seq_read - Prepare a sequential read of skb data
2777 * @skb: the buffer to read
2778 * @from: lower offset of data to be read
2779 * @to: upper offset of data to be read
2780 * @st: state variable
2781 *
2782 * Initializes the specified state variable. Must be called before
2783 * invoking skb_seq_read() for the first time.
2784 */
2785void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
2786			  unsigned int to, struct skb_seq_state *st)
2787{
2788	st->lower_offset = from;
2789	st->upper_offset = to;
2790	st->root_skb = st->cur_skb = skb;
2791	st->frag_idx = st->stepped_offset = 0;
2792	st->frag_data = NULL;
2793}
2794EXPORT_SYMBOL(skb_prepare_seq_read);
2795
/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 *
 * Note 3: While a block inside a page fragment is handed out, the
 *       fragment's page stays mapped with kmap_atomic() and the mapping
 *       is remembered in @st->frag_data. It is released when reading
 *       moves past that fragment or when this function returns 0 on
 *       reaching the upper offset or the end of the data.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
			  struct skb_seq_state *st)
{
	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
	skb_frag_t *frag;

	/* Upper offset reached: drop a mapping that may still be held. */
	if (unlikely(abs_offset >= st->upper_offset)) {
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}
		return 0;
	}

next_skb:
	/* block_limit is the absolute offset at which the linear area of
	 * the current skb ends.
	 */
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

	if (abs_offset < block_limit && !st->frag_data) {
		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
		return block_limit - abs_offset;
	}

	/* Step over the linear area, but only once per skb: frag_idx == 0
	 * with no fragment mapped means the fragments were not entered yet.
	 */
	if (st->frag_idx == 0 && !st->frag_data)
		st->stepped_offset += skb_headlen(st->cur_skb);

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
		block_limit = skb_frag_size(frag) + st->stepped_offset;

		if (abs_offset < block_limit) {
			/* Reuse the mapping if a previous call set it up. */
			if (!st->frag_data)
				st->frag_data = kmap_atomic(skb_frag_page(frag));

			*data = (u8 *) st->frag_data + frag->page_offset +
				(abs_offset - st->stepped_offset);

			return block_limit - abs_offset;
		}

		/* Done with this fragment: unmap it and move on. */
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}

		st->frag_idx++;
		st->stepped_offset += skb_frag_size(frag);
	}

	if (st->frag_data) {
		kunmap_atomic(st->frag_data);
		st->frag_data = NULL;
	}

	/* Continue with the fragment list of the root skb, or with the
	 * next skb in the chain currently being walked.
	 */
	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	} else if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;
	}

	return 0;
}
EXPORT_SYMBOL(skb_seq_read);
2887
2888/**
2889 * skb_abort_seq_read - Abort a sequential read of skb data
2890 * @st: state variable
2891 *
2892 * Must be called if skb_seq_read() was not called until it
2893 * returned 0.
2894 */
2895void skb_abort_seq_read(struct skb_seq_state *st)
2896{
2897	if (st->frag_data)
2898		kunmap_atomic(st->frag_data);
2899}
2900EXPORT_SYMBOL(skb_abort_seq_read);
2901
/* Access the cb member of a textsearch state as a struct skb_seq_state,
 * i.e. the skb sequential-read state is stored inside the search state.
 */
#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))
2903
2904static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
2905					  struct ts_config *conf,
2906					  struct ts_state *state)
2907{
2908	return skb_seq_read(offset, text, TS_SKB_CB(state));
2909}
2910
/* Textsearch finish callback for skb data: abort the sequential read
 * whose state is kept in @state.  @conf is not used.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	struct skb_seq_state *st = TS_SKB_CB(state);

	skb_abort_seq_read(st);
}
2915
2916/**
2917 * skb_find_text - Find a text pattern in skb data
2918 * @skb: the buffer to look in
2919 * @from: search offset
2920 * @to: search limit
2921 * @config: textsearch configuration
2922 *
2923 * Finds a pattern in the skb data according to the specified
2924 * textsearch configuration. Use textsearch_next() to retrieve
2925 * subsequent occurrences of the pattern. Returns the offset
2926 * to the first occurrence or UINT_MAX if no match was found.
2927 */
2928unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
2929			   unsigned int to, struct ts_config *config)
2930{
2931	struct ts_state state;
2932	unsigned int ret;
2933
2934	config->get_next_block = skb_ts_get_next_block;
2935	config->finish = skb_ts_finish;
2936
2937	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
2938
2939	ret = textsearch_find(config, &state);
2940	return (ret <= to - from ? ret : UINT_MAX);
2941}
2942EXPORT_SYMBOL(skb_find_text);
2943
2944/**
2945 * skb_append_datato_frags - append the user data to a skb
2946 * @sk: sock  structure
2947 * @skb: skb structure to be appended with user data.
2948 * @getfrag: call back function to be used for getting the user data
2949 * @from: pointer to user message iov
2950 * @length: length of the iov message
2951 *
2952 * Description: This procedure append the user data in the fragment part
2953 * of the skb if any page alloc fails user this procedure returns  -ENOMEM
2954 */
2955int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2956			int (*getfrag)(void *from, char *to, int offset,
2957					int len, int odd, struct sk_buff *skb),
2958			void *from, int length)
2959{
2960	int frg_cnt = skb_shinfo(skb)->nr_frags;
2961	int copy;
2962	int offset = 0;
2963	int ret;
2964	struct page_frag *pfrag = &current->task_frag;
2965
2966	do {
2967		/* Return error if we don't have space for new frag */
2968		if (frg_cnt >= MAX_SKB_FRAGS)
2969			return -EMSGSIZE;
2970
2971		if (!sk_page_frag_refill(sk, pfrag))
2972			return -ENOMEM;
2973
2974		/* copy the user data to page */
2975		copy = min_t(int, length, pfrag->size - pfrag->offset);
2976
2977		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
2978			      offset, copy, 0, skb);
2979		if (ret < 0)
2980			return -EFAULT;
2981
2982		/* copy was successful so update the size parameters */
2983		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
2984				   copy);
2985		frg_cnt++;
2986		pfrag->offset += copy;
2987		get_page(pfrag->page);
2988
2989		skb->truesize += copy;
2990		atomic_add(copy, &sk->sk_wmem_alloc);
2991		skb->len += copy;
2992		skb->data_len += copy;
2993		offset += copy;
2994		length -= copy;
2995
2996	} while (length > 0);
2997
2998	return 0;
2999}
3000EXPORT_SYMBOL(skb_append_datato_frags);
3001
3002int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
3003			 int offset, size_t size)
3004{
3005	int i = skb_shinfo(skb)->nr_frags;
3006
3007	if (skb_can_coalesce(skb, i, page, offset)) {
3008		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
3009	} else if (i < MAX_SKB_FRAGS) {
3010		get_page(page);
3011		skb_fill_page_desc(skb, i, page, offset, size);
3012	} else {
3013		return -EMSGSIZE;
3014	}
3015
3016	return 0;
3017}
3018EXPORT_SYMBOL_GPL(skb_append_pagefrags);
3019
3020/**
3021 *	skb_pull_rcsum - pull skb and update receive checksum
3022 *	@skb: buffer to update
3023 *	@len: length of data pulled
3024 *
3025 *	This function performs an skb_pull on the packet and updates
3026 *	the CHECKSUM_COMPLETE checksum.  It should be used on
3027 *	receive path processing instead of skb_pull unless you know
3028 *	that the checksum difference is zero (e.g., a valid IP header)
3029 *	or you are setting ip_summed to CHECKSUM_NONE.
3030 */
3031unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
3032{
3033	unsigned char *data = skb->data;
3034
3035	BUG_ON(len > skb->len);
3036	__skb_pull(skb, len);
3037	skb_postpull_rcsum(skb, data, len);
3038	return skb->data;
3039}
3040EXPORT_SYMBOL_GPL(skb_pull_rcsum);
3041
3042/**
3043 *	skb_segment - Perform protocol segmentation on skb.
3044 *	@head_skb: buffer to segment
3045 *	@features: features for the output path (see dev->features)
3046 *
3047 *	This function performs segmentation on the given skb.  It returns
3048 *	a pointer to the first in a list of new skbs for the segments.
3049 *	In case of error it returns ERR_PTR(err).
3050 */
3051struct sk_buff *skb_segment(struct sk_buff *head_skb,
3052			    netdev_features_t features)
3053{
3054	struct sk_buff *segs = NULL;
3055	struct sk_buff *tail = NULL;
3056	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
3057	skb_frag_t *frag = skb_shinfo(head_skb)->frags;
3058	unsigned int mss = skb_shinfo(head_skb)->gso_size;
3059	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
3060	struct sk_buff *frag_skb = head_skb;
3061	unsigned int offset = doffset;
3062	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
3063	unsigned int partial_segs = 0;
3064	unsigned int headroom;
3065	unsigned int len = head_skb->len;
3066	__be16 proto;
3067	bool csum, sg;
3068	int nfrags = skb_shinfo(head_skb)->nr_frags;
3069	int err = -ENOMEM;
3070	int i = 0;
3071	int pos;
3072	int dummy;
3073
3074	__skb_push(head_skb, doffset);
3075	proto = skb_network_protocol(head_skb, &dummy);
3076	if (unlikely(!proto))
3077		return ERR_PTR(-EINVAL);
3078
3079	sg = !!(features & NETIF_F_SG);
3080	csum = !!can_checksum_protocol(features, proto);
3081
3082	if (sg && csum && (mss != GSO_BY_FRAGS))  {
3083		if (!(features & NETIF_F_GSO_PARTIAL)) {
3084			struct sk_buff *iter;
3085			unsigned int frag_len;
3086
3087			if (!list_skb ||
3088			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3089				goto normal;
3090
3091			/* If we get here then all the required
3092			 * GSO features except frag_list are supported.
3093			 * Try to split the SKB to multiple GSO SKBs
3094			 * with no frag_list.
3095			 * Currently we can do that only when the buffers don't
3096			 * have a linear part and all the buffers except
3097			 * the last are of the same length.
3098			 */
3099			frag_len = list_skb->len;
3100			skb_walk_frags(head_skb, iter) {
3101				if (frag_len != iter->len && iter->next)
3102					goto normal;
3103				if (skb_headlen(iter))
3104					goto normal;
3105
3106				len -= iter->len;
3107			}
3108
3109			if (len != frag_len)
3110				goto normal;
3111		}
3112
3113		/* GSO partial only requires that we trim off any excess that
3114		 * doesn't fit into an MSS sized block, so take care of that
3115		 * now.
3116		 */
3117		partial_segs = len / mss;
3118		if (partial_segs > 1)
3119			mss *= partial_segs;
3120		else
3121			partial_segs = 0;
3122	}
3123
3124normal:
3125	headroom = skb_headroom(head_skb);
3126	pos = skb_headlen(head_skb);
3127
3128	do {
3129		struct sk_buff *nskb;
3130		skb_frag_t *nskb_frag;
3131		int hsize;
3132		int size;
3133
3134		if (unlikely(mss == GSO_BY_FRAGS)) {
3135			len = list_skb->len;
3136		} else {
3137			len = head_skb->len - offset;
3138			if (len > mss)
3139				len = mss;
3140		}
3141
3142		hsize = skb_headlen(head_skb) - offset;
3143		if (hsize < 0)
3144			hsize = 0;
3145		if (hsize > len || !sg)
3146			hsize = len;
3147
3148		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
3149		    (skb_headlen(list_skb) == len || sg)) {
3150			BUG_ON(skb_headlen(list_skb) > len);
3151
3152			i = 0;
3153			nfrags = skb_shinfo(list_skb)->nr_frags;
3154			frag = skb_shinfo(list_skb)->frags;
3155			frag_skb = list_skb;
3156			pos += skb_headlen(list_skb);
3157
3158			while (pos < offset + len) {
3159				BUG_ON(i >= nfrags);
3160
3161				size = skb_frag_size(frag);
3162				if (pos + size > offset + len)
3163					break;
3164
3165				i++;
3166				pos += size;
3167				frag++;
3168			}
3169
3170			nskb = skb_clone(list_skb, GFP_ATOMIC);
3171			list_skb = list_skb->next;
3172
3173			if (unlikely(!nskb))
3174				goto err;
3175
3176			if (unlikely(pskb_trim(nskb, len))) {
3177				kfree_skb(nskb);
3178				goto err;
3179			}
3180
3181			hsize = skb_end_offset(nskb);
3182			if (skb_cow_head(nskb, doffset + headroom)) {
3183				kfree_skb(nskb);
3184				goto err;
3185			}
3186
3187			nskb->truesize += skb_end_offset(nskb) - hsize;
3188			skb_release_head_state(nskb);
3189			__skb_push(nskb, doffset);
3190		} else {
3191			nskb = __alloc_skb(hsize + doffset + headroom,
3192					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
3193					   NUMA_NO_NODE);
3194
3195			if (unlikely(!nskb))
3196				goto err;
3197
3198			skb_reserve(nskb, headroom);
3199			__skb_put(nskb, doffset);
3200		}
3201
3202		if (segs)
3203			tail->next = nskb;
3204		else
3205			segs = nskb;
3206		tail = nskb;
3207
3208		__copy_skb_header(nskb, head_skb);
3209
3210		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
3211		skb_reset_mac_len(nskb);
3212
3213		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
3214						 nskb->data - tnl_hlen,
3215						 doffset + tnl_hlen);
3216
3217		if (nskb->len == len + doffset)
3218			goto perform_csum_check;
3219
3220		if (!sg) {
3221			if (!nskb->remcsum_offload)
3222				nskb->ip_summed = CHECKSUM_NONE;
3223			SKB_GSO_CB(nskb)->csum =
3224				skb_copy_and_csum_bits(head_skb, offset,
3225						       skb_put(nskb, len),
3226						       len, 0);
3227			SKB_GSO_CB(nskb)->csum_start =
3228				skb_headroom(nskb) + doffset;
3229			continue;
3230		}
3231
3232		nskb_frag = skb_shinfo(nskb)->frags;
3233
3234		skb_copy_from_linear_data_offset(head_skb, offset,
3235						 skb_put(nskb, hsize), hsize);
3236
3237		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
3238			SKBTX_SHARED_FRAG;
3239
3240		while (pos < offset + len) {
3241			if (i >= nfrags) {
3242				BUG_ON(skb_headlen(list_skb));
3243
3244				i = 0;
3245				nfrags = skb_shinfo(list_skb)->nr_frags;
3246				frag = skb_shinfo(list_skb)->frags;
3247				frag_skb = list_skb;
3248
3249				BUG_ON(!nfrags);
3250
3251				list_skb = list_skb->next;
3252			}
3253
3254			if (unlikely(skb_shinfo(nskb)->nr_frags >=
3255				     MAX_SKB_FRAGS)) {
3256				net_warn_ratelimited(
3257					"skb_segment: too many frags: %u %u\n",
3258					pos, mss);
3259				goto err;
3260			}
3261
3262			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
3263				goto err;
3264
3265			*nskb_frag = *frag;
3266			__skb_frag_ref(nskb_frag);
3267			size = skb_frag_size(nskb_frag);
3268
3269			if (pos < offset) {
3270				nskb_frag->page_offset += offset - pos;
3271				skb_frag_size_sub(nskb_frag, offset - pos);
3272			}
3273
3274			skb_shinfo(nskb)->nr_frags++;
3275
3276			if (pos + size <= offset + len) {
3277				i++;
3278				frag++;
3279				pos += size;
3280			} else {
3281				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
3282				goto skip_fraglist;
3283			}
3284
3285			nskb_frag++;
3286		}
3287
3288skip_fraglist:
3289		nskb->data_len = len - hsize;
3290		nskb->len += nskb->data_len;
3291		nskb->truesize += nskb->data_len;
3292
3293perform_csum_check:
3294		if (!csum) {
3295			if (skb_has_shared_frag(nskb)) {
3296				err = __skb_linearize(nskb);
3297				if (err)
3298					goto err;
3299			}
3300			if (!nskb->remcsum_offload)
3301				nskb->ip_summed = CHECKSUM_NONE;
3302			SKB_GSO_CB(nskb)->csum =
3303				skb_checksum(nskb, doffset,
3304					     nskb->len - doffset, 0);
3305			SKB_GSO_CB(nskb)->csum_start =
3306				skb_headroom(nskb) + doffset;
3307		}
3308	} while ((offset += len) < head_skb->len);
3309
3310	/* Some callers want to get the end of the list.
3311	 * Put it in segs->prev to avoid walking the list.
3312	 * (see validate_xmit_skb_list() for example)
3313	 */
3314	segs->prev = tail;
3315
3316	if (partial_segs) {
3317		struct sk_buff *iter;
3318		int type = skb_shinfo(head_skb)->gso_type;
3319		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
3320
3321		/* Update type to add partial and then remove dodgy if set */
3322		type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
3323		type &= ~SKB_GSO_DODGY;
3324
3325		/* Update GSO info and prepare to start updating headers on
3326		 * our way back down the stack of protocols.
3327		 */
3328		for (iter = segs; iter; iter = iter->next) {
3329			skb_shinfo(iter)->gso_size = gso_size;
3330			skb_shinfo(iter)->gso_segs = partial_segs;
3331			skb_shinfo(iter)->gso_type = type;
3332			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
3333		}
3334
3335		if (tail->len - doffset <= gso_size)
3336			skb_shinfo(tail)->gso_size = 0;
3337		else if (tail != segs)
3338			skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
3339	}
3340
3341	/* Following permits correct backpressure, for protocols
3342	 * using skb_set_owner_w().
3343	 * Idea is to tranfert ownership from head_skb to last segment.
3344	 */
3345	if (head_skb->destructor == sock_wfree) {
3346		swap(tail->truesize, head_skb->truesize);
3347		swap(tail->destructor, head_skb->destructor);
3348		swap(tail->sk, head_skb->sk);
3349	}
3350	return segs;
3351
3352err:
3353	kfree_skb_list(segs);
3354	return ERR_PTR(err);
3355}
3356EXPORT_SYMBOL_GPL(skb_segment);
3357
/**
 *	skb_gro_receive - merge a received buffer into a held GRO buffer
 *	@head: pointer to the buffer (*@head) that @skb is merged into
 *	@skb: buffer whose data past skb_gro_offset() is appended
 *
 *	Three strategies are tried in order:
 *	 1. if nothing of @skb's linear area lies past the GRO offset, its
 *	    page frags are moved into the frag array of the last merged buffer;
 *	 2. if @skb's head is a page fragment (skb->head_frag), the remaining
 *	    linear bytes are described as one extra frag, followed by @skb's
 *	    own frags;
 *	 3. otherwise (or when the frag array would overflow) @skb itself is
 *	    chained behind the previously merged buffers.
 *
 *	Returns 0 on success (and sets NAPI_GRO_CB(@skb)->same_flow), or
 *	-E2BIG when the merged length would reach 65536 bytes.
 */
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int len = skb_gro_len(skb);
	struct sk_buff *lp, *p = *head;
	unsigned int delta_truesize;

	if (unlikely(p->len + len >= 65536))
		return -E2BIG;

	/* frags are appended to the last buffer merged so far, not to p */
	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);

	if (headlen <= offset) {
		/* case 1: the data to merge lives entirely in page frags */
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/* bytes of the GRO offset that extend into the first frag */
		offset -= headlen;
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		/* Copy the frag descriptors back to front.
		 * NOTE(review): the do/while body runs at least once, so this
		 * relies on skbinfo->nr_frags being non-zero here - confirm.
		 */
		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		/* frag now points at the first copied descriptor: trim the
		 * already-consumed header bytes from its front.
		 */
		frag->page_offset += offset;
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	} else if (skb->head_frag) {
		/* case 2: the linear area sits in a page; reference the part
		 * past the GRO offset as a frag instead of copying it.
		 */
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		/* one slot for the head plus one per existing frag of skb */
		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;

		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		frag->page.p	  = page;
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}

merge:
	/* case 3: keep skb as a whole and link it behind the merged buffers */
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		/* the GRO offset reaches into the first frag: drop those bytes */
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	/* first chained buffer goes on p's frag_list, later ones on ->next */
	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	else
		NAPI_GRO_CB(p)->last->next = skb;
	NAPI_GRO_CB(p)->last = skb;
	__skb_header_release(skb);
	/* makes the "lp != p" accounting below a no-op for this path */
	lp = p;

done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	if (lp != p) {
		/* frags were added to a chained buffer: account them there too */
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_gro_receive);
3469
3470void __init skb_init(void)
3471{
3472	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
3473					      sizeof(struct sk_buff),
3474					      0,
3475					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3476					      NULL);
3477	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
3478						sizeof(struct sk_buff_fclones),
3479						0,
3480						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3481						NULL);
3482}
3483
/**
 *	__skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 *	@skb: Socket buffer containing the buffers to be mapped
 *	@sg: The scatter-gather list to map into
 *	@offset: The offset into the buffer's contents to start mapping
 *	@len: Length of buffer space to be mapped
 *
 *	Fill the specified scatter-gather list with mappings/pointers into a
 *	region of the buffer space attached to a socket buffer.  The linear
 *	area, the page frags and then every buffer on the frag list (by
 *	recursion) are visited in that order.
 *
 *	This function does not check how many entries @sg can hold, and does
 *	not mark the last entry used.  It hits BUG_ON() if @len bytes could
 *	not be mapped.
 *
 *	Returns the number of scatterlist entries filled in.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
	/* start/end track the byte range of the piece currently examined */
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int elt = 0;

	/* part of the request lies in the linear area */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		sg_set_buf(sg, skb->data + offset, copy);
		elt++;
		if ((len -= copy) == 0)
			return elt;
		offset += copy;
	}

	/* page frags */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			/* offset - start: position of the request inside this frag */
			sg_set_page(&sg[elt], skb_frag_page(frag), copy,
					frag->page_offset+offset-start);
			elt++;
			if (!(len -= copy))
				return elt;
			offset += copy;
		}
		start = end;
	}

	/* buffers chained on the frag list, each mapped recursively */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
					      copy);
			if ((len -= copy) == 0)
				return elt;
			offset += copy;
		}
		start = end;
	}
	/* the requested range extends past the data held by skb */
	BUG_ON(len);
	return elt;
}
3553
/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
 * given sglist and does not mark the sg entry holding the last skb data as
 * the end.  The caller can therefore keep appending to the sg list after the
 * first call without having to call sg_unmark_end to extend it.
 *
 * Scenario to use skb_to_sgvec_nomark:
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
 *
 * Returns the number of scatterlist entries filled in, as reported by
 * __skb_to_sgvec().
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
			int offset, int len)
{
	return __skb_to_sgvec(skb, sg, offset, len);
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
3579
3580int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
3581{
3582	int nsg = __skb_to_sgvec(skb, sg, offset, len);
3583
3584	sg_mark_end(&sg[nsg - 1]);
3585
3586	return nsg;
3587}
3588EXPORT_SYMBOL_GPL(skb_to_sgvec);
3589
/**
 *	skb_cow_data - Check that a socket buffer's data buffers are writable
 *	@skb: The socket buffer to check.
 *	@tailbits: Amount of trailing space to be added
 *	@trailer: Returned pointer to the skb where the @tailbits space begins
 *
 *	Make sure that the data buffers attached to a socket buffer are
 *	writable. If they are not, private copies are made of the data buffers
 *	and the socket buffer is set to use these instead.
 *
 *	If @tailbits is given, make sure that there is space to write @tailbits
 *	bytes of data beyond current end of socket buffer.  @trailer will be
 *	set to point to the skb in which this space begins.
 *
 *	The number of scatterlist elements required to completely map the
 *	COW'd and extended socket buffer will be returned, or -ENOMEM if a
 *	required reallocation or copy failed.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
	int copyflag;
	int elt;
	struct sk_buff *skb1, **skb_p;

	/* If skb is cloned or its head is paged, reallocate
	 * head pulling out all the pages (pages are considered not writable
	 * at the moment even if they are anonymous).
	 */
	if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
	    __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
		return -ENOMEM;

	/* Easy case. Most of packets will go this way. */
	if (!skb_has_frag_list(skb)) {
		/* A little of trouble, not enough of space for trailer.
		 * This should not happen, when stack is tuned to generate
		 * good frames. OK, on miss we reallocate and reserve even more
		 * space, 128 bytes is fair. */

		if (skb_tailroom(skb) < tailbits &&
		    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
			return -ENOMEM;

		/* Voila! */
		*trailer = skb;
		return 1;
	}

	/* Hard case: walk the frag list and replace every buffer that is
	 * not privately writable by a linear copy.
	 */

	/* one element for skb's own linear area */
	elt = 1;
	skb_p = &skb_shinfo(skb)->frag_list;
	copyflag = 0;

	while ((skb1 = *skb_p) != NULL) {
		int ntail = 0;

		/* The fragment is partially pulled by someone,
		 * this can happen on input. Copy it and everything
		 * after it. */

		if (skb_shared(skb1))
			copyflag = 1;

		/* If the skb is the last, worry about trailer. */

		if (skb1->next == NULL && tailbits) {
			if (skb_shinfo(skb1)->nr_frags ||
			    skb_has_frag_list(skb1) ||
			    skb_tailroom(skb1) < tailbits)
				ntail = tailbits + 128;
		}

		if (copyflag ||
		    skb_cloned(skb1) ||
		    ntail ||
		    skb_shinfo(skb1)->nr_frags ||
		    skb_has_frag_list(skb1)) {
			struct sk_buff *skb2;

			/* This fragment needs a private copy; ask for extra
			 * tailroom only when the trailer has to fit into it.
			 */
			if (ntail == 0)
				skb2 = skb_copy(skb1, GFP_ATOMIC);
			else
				skb2 = skb_copy_expand(skb1,
						       skb_headroom(skb1),
						       ntail,
						       GFP_ATOMIC);
			if (unlikely(skb2 == NULL))
				return -ENOMEM;

			/* keep the copy charged to the same socket */
			if (skb1->sk)
				skb_set_owner_w(skb2, skb1->sk);

			/* Looking around. Are we still alive?
			 * OK, link new skb, drop old one */

			skb2->next = skb1->next;
			*skb_p = skb2;
			kfree_skb(skb1);
			skb1 = skb2;
		}
		elt++;
		/* after the loop this is the last buffer of the list */
		*trailer = skb1;
		skb_p = &skb1->next;
	}

	return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);
3699
3700static void sock_rmem_free(struct sk_buff *skb)
3701{
3702	struct sock *sk = skb->sk;
3703
3704	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3705}
3706
3707static void skb_set_err_queue(struct sk_buff *skb)
3708{
3709	/* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
3710	 * So, it is safe to (mis)use it to mark skbs on the error queue.
3711	 */
3712	skb->pkt_type = PACKET_OUTGOING;
3713	BUILD_BUG_ON(PACKET_OUTGOING == 0);
3714}
3715
3716/*
3717 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
3718 */
3719int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3720{
3721	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
3722	    (unsigned int)sk->sk_rcvbuf)
3723		return -ENOMEM;
3724
3725	skb_orphan(skb);
3726	skb->sk = sk;
3727	skb->destructor = sock_rmem_free;
3728	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
3729	skb_set_err_queue(skb);
3730
3731	/* before exiting rcu section, make sure dst is refcounted */
3732	skb_dst_force(skb);
3733
3734	skb_queue_tail(&sk->sk_error_queue, skb);
3735	if (!sock_flag(sk, SOCK_DEAD))
3736		sk->sk_data_ready(sk);
3737	return 0;
3738}
3739EXPORT_SYMBOL(sock_queue_err_skb);
3740
3741static bool is_icmp_err_skb(const struct sk_buff *skb)
3742{
3743	return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
3744		       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
3745}
3746
3747struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
3748{
3749	struct sk_buff_head *q = &sk->sk_error_queue;
3750	struct sk_buff *skb, *skb_next = NULL;
3751	bool icmp_next = false;
3752	unsigned long flags;
3753
3754	spin_lock_irqsave(&q->lock, flags);
3755	skb = __skb_dequeue(q);
3756	if (skb && (skb_next = skb_peek(q)))
3757		icmp_next = is_icmp_err_skb(skb_next);
3758	spin_unlock_irqrestore(&q->lock, flags);
3759
3760	if (is_icmp_err_skb(skb) && !icmp_next)
3761		sk->sk_err = 0;
3762
3763	if (skb_next)
3764		sk->sk_error_report(sk);
3765
3766	return skb;
3767}
3768EXPORT_SYMBOL(sock_dequeue_err_skb);
3769
3770/**
3771 * skb_clone_sk - create clone of skb, and take reference to socket
3772 * @skb: the skb to clone
3773 *
3774 * This function creates a clone of a buffer that holds a reference on
3775 * sk_refcnt.  Buffers created via this function are meant to be
3776 * returned using sock_queue_err_skb, or free via kfree_skb.
3777 *
3778 * When passing buffers allocated with this function to sock_queue_err_skb
3779 * it is necessary to wrap the call with sock_hold/sock_put in order to
3780 * prevent the socket from being released prior to being enqueued on
3781 * the sk_error_queue.
3782 */
3783struct sk_buff *skb_clone_sk(struct sk_buff *skb)
3784{
3785	struct sock *sk = skb->sk;
3786	struct sk_buff *clone;
3787
3788	if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
3789		return NULL;
3790
3791	clone = skb_clone(skb, GFP_ATOMIC);
3792	if (!clone) {
3793		sock_put(sk);
3794		return NULL;
3795	}
3796
3797	clone->sk = sk;
3798	clone->destructor = sock_efree;
3799
3800	return clone;
3801}
3802EXPORT_SYMBOL(skb_clone_sk);
3803
3804static void __skb_complete_tx_timestamp(struct sk_buff *skb,
3805					struct sock *sk,
3806					int tstype,
3807					bool opt_stats)
3808{
3809	struct sock_exterr_skb *serr;
3810	int err;
3811
3812	BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
3813
3814	serr = SKB_EXT_ERR(skb);
3815	memset(serr, 0, sizeof(*serr));
3816	serr->ee.ee_errno = ENOMSG;
3817	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
3818	serr->ee.ee_info = tstype;
3819	serr->opt_stats = opt_stats;
3820	serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
3821	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
3822		serr->ee.ee_data = skb_shinfo(skb)->tskey;
3823		if (sk->sk_protocol == IPPROTO_TCP &&
3824		    sk->sk_type == SOCK_STREAM)
3825			serr->ee.ee_data -= sk->sk_tskey;
3826	}
3827
3828	err = sock_queue_err_skb(sk, skb);
3829
3830	if (err)
3831		kfree_skb(skb);
3832}
3833
3834static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
3835{
3836	bool ret;
3837
3838	if (likely(sysctl_tstamp_allow_data || tsonly))
3839		return true;
3840
3841	read_lock_bh(&sk->sk_callback_lock);
3842	ret = sk->sk_socket && sk->sk_socket->file &&
3843	      file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
3844	read_unlock_bh(&sk->sk_callback_lock);
3845	return ret;
3846}
3847
3848void skb_complete_tx_timestamp(struct sk_buff *skb,
3849			       struct skb_shared_hwtstamps *hwtstamps)
3850{
3851	struct sock *sk = skb->sk;
3852
3853	if (!skb_may_tx_timestamp(sk, false))
3854		return;
3855
3856	/* Take a reference to prevent skb_orphan() from freeing the socket,
3857	 * but only if the socket refcount is not zero.
3858	 */
3859	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3860		*skb_hwtstamps(skb) = *hwtstamps;
3861		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
3862		sock_put(sk);
3863	}
3864}
3865EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
3866
/**
 * __skb_tstamp_tx - queue a tx timestamp report for @orig_skb on @sk
 * @orig_skb: the buffer the timestamp refers to
 * @hwtstamps: hardware timestamps to report, or NULL to record
 *	       ktime_get_real() instead
 * @sk: socket whose error queue receives the report; NULL makes this a no-op
 * @tstype: timestamp type stored in the report (e.g. SCM_TSTAMP_SND)
 *
 * The report is carried by a new buffer: a clone of @orig_skb, or, when the
 * socket asked for SOF_TIMESTAMPING_OPT_TSONLY, a buffer without the packet
 * data.  Nothing is reported if the socket may not receive the timestamp or
 * if the buffer cannot be allocated.  @orig_skb itself is not consumed.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
		     struct skb_shared_hwtstamps *hwtstamps,
		     struct sock *sk, int tstype)
{
	struct sk_buff *skb;
	bool tsonly, opt_stats = false;

	if (!sk)
		return;

	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
	if (!skb_may_tx_timestamp(sk, tsonly))
		return;

	if (tsonly) {
#ifdef CONFIG_INET
		/* TCP stream sockets that asked for OPT_STATS get a buffer
		 * from tcp_get_timestamping_opt_stats() instead of an empty one
		 */
		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
		    sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM) {
			skb = tcp_get_timestamping_opt_stats(sk);
			opt_stats = true;
		} else
#endif
			skb = alloc_skb(0, GFP_ATOMIC);
	} else {
		skb = skb_clone(orig_skb, GFP_ATOMIC);
	}
	if (!skb)
		return;

	/* a freshly allocated buffer does not share orig_skb's shared info,
	 * so carry over the fields the report needs
	 */
	if (tsonly) {
		skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags;
		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
	}

	if (hwtstamps)
		*skb_hwtstamps(skb) = *hwtstamps;
	else
		skb->tstamp = ktime_get_real();

	/* queues skb on sk's error queue, or frees it on failure */
	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
3910
3911void skb_tstamp_tx(struct sk_buff *orig_skb,
3912		   struct skb_shared_hwtstamps *hwtstamps)
3913{
3914	return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
3915			       SCM_TSTAMP_SND);
3916}
3917EXPORT_SYMBOL_GPL(skb_tstamp_tx);
3918
3919void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3920{
3921	struct sock *sk = skb->sk;
3922	struct sock_exterr_skb *serr;
3923	int err = 1;
3924
3925	skb->wifi_acked_valid = 1;
3926	skb->wifi_acked = acked;
3927
3928	serr = SKB_EXT_ERR(skb);
3929	memset(serr, 0, sizeof(*serr));
3930	serr->ee.ee_errno = ENOMSG;
3931	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3932
3933	/* Take a reference to prevent skb_orphan() from freeing the socket,
3934	 * but only if the socket refcount is not zero.
3935	 */
3936	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3937		err = sock_queue_err_skb(sk, skb);
3938		sock_put(sk);
3939	}
3940	if (err)
3941		kfree_skb(skb);
3942}
3943EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
3944
3945/**
3946 * skb_partial_csum_set - set up and verify partial csum values for packet
3947 * @skb: the skb to set
3948 * @start: the number of bytes after skb->data to start checksumming.
3949 * @off: the offset from start to place the checksum.
3950 *
3951 * For untrusted partially-checksummed packets, we need to make sure the values
3952 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
3953 *
3954 * This function checks and sets those values and skb->ip_summed: if this
3955 * returns false you should drop the packet.
3956 */
3957bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
3958{
3959	if (unlikely(start > skb_headlen(skb)) ||
3960	    unlikely((int)start + off > skb_headlen(skb) - 2)) {
3961		net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
3962				     start, off, skb_headlen(skb));
3963		return false;
3964	}
3965	skb->ip_summed = CHECKSUM_PARTIAL;
3966	skb->csum_start = skb_headroom(skb) + start;
3967	skb->csum_offset = off;
3968	skb_set_transport_header(skb, start);
3969	return true;
3970}
3971EXPORT_SYMBOL_GPL(skb_partial_csum_set);
3972
3973static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
3974			       unsigned int max)
3975{
3976	if (skb_headlen(skb) >= len)
3977		return 0;
3978
3979	/* If we need to pullup then pullup to the max, so we
3980	 * won't need to do it again.
3981	 */
3982	if (max > skb->len)
3983		max = skb->len;
3984
3985	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
3986		return -ENOMEM;
3987
3988	if (skb_headlen(skb) < len)
3989		return -EPROTO;
3990
3991	return 0;
3992}
3993
3994#define MAX_TCP_HDR_LEN (15 * 4)
3995
3996static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
3997				      typeof(IPPROTO_IP) proto,
3998				      unsigned int off)
3999{
4000	switch (proto) {
4001		int err;
4002
4003	case IPPROTO_TCP:
4004		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
4005					  off + MAX_TCP_HDR_LEN);
4006		if (!err && !skb_partial_csum_set(skb, off,
4007						  offsetof(struct tcphdr,
4008							   check)))
4009			err = -EPROTO;
4010		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
4011
4012	case IPPROTO_UDP:
4013		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
4014					  off + sizeof(struct udphdr));
4015		if (!err && !skb_partial_csum_set(skb, off,
4016						  offsetof(struct udphdr,
4017							   check)))
4018			err = -EPROTO;
4019		return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
4020	}
4021
4022	return ERR_PTR(-EPROTO);
4023}
4024
4025/* This value should be large enough to cover a tagged ethernet header plus
4026 * maximally sized IP and TCP or UDP headers.
4027 */
4028#define MAX_IP_HDR_LEN 128
4029
4030static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
4031{
4032	unsigned int off;
4033	bool fragment;
4034	__sum16 *csum;
4035	int err;
4036
4037	fragment = false;
4038
4039	err = skb_maybe_pull_tail(skb,
4040				  sizeof(struct iphdr),
4041				  MAX_IP_HDR_LEN);
4042	if (err < 0)
4043		goto out;
4044
4045	if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
4046		fragment = true;
4047
4048	off = ip_hdrlen(skb);
4049
4050	err = -EPROTO;
4051
4052	if (fragment)
4053		goto out;
4054
4055	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
4056	if (IS_ERR(csum))
4057		return PTR_ERR(csum);
4058
4059	if (recalculate)
4060		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
4061					   ip_hdr(skb)->daddr,
4062					   skb->len - off,
4063					   ip_hdr(skb)->protocol, 0);
4064	err = 0;
4065
4066out:
4067	return err;
4068}
4069
4070/* This value should be large enough to cover a tagged ethernet header plus
4071 * an IPv6 header, all options, and a maximal TCP or UDP header.
4072 */
4073#define MAX_IPV6_HDR_LEN 256
4074
4075#define OPT_HDR(type, skb, off) \
4076	(type *)(skb_network_header(skb) + (off))
4077
/*
 * IPv6 part of skb_checksum_setup(): walk the chain of extension headers to
 * find the TCP/UDP header and set up the partial checksum for it.
 *
 * Returns 0 on success.  -EPROTO is returned for fragments, when the walk
 * runs past the payload length without reaching an upper-layer header, or
 * when the final header is not TCP/UDP; errors from skb_maybe_pull_tail()
 * are passed through.  With @recalculate the checksum field is seeded with
 * the pseudo-header checksum.
 */
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
	int err;
	u8 nexthdr;
	unsigned int off;
	unsigned int len;
	bool fragment;
	bool done;
	__sum16 *csum;

	fragment = false;
	done = false;

	/* off: offset of the header currently examined, from the IPv6 header */
	off = sizeof(struct ipv6hdr);

	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
	if (err < 0)
		goto out;

	nexthdr = ipv6_hdr(skb)->nexthdr;

	/* total packet length as announced by the IPv6 header */
	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
	while (off <= len && !done) {
		switch (nexthdr) {
		case IPPROTO_DSTOPTS:
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING: {
			struct ipv6_opt_hdr *hp;

			err = skb_maybe_pull_tail(skb,
						  off +
						  sizeof(struct ipv6_opt_hdr),
						  MAX_IPV6_HDR_LEN);
			if (err < 0)
				goto out;

			/* hp is looked up only after the pull above */
			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
			nexthdr = hp->nexthdr;
			off += ipv6_optlen(hp);
			break;
		}
		case IPPROTO_AH: {
			struct ip_auth_hdr *hp;

			err = skb_maybe_pull_tail(skb,
						  off +
						  sizeof(struct ip_auth_hdr),
						  MAX_IPV6_HDR_LEN);
			if (err < 0)
				goto out;

			hp = OPT_HDR(struct ip_auth_hdr, skb, off);
			nexthdr = hp->nexthdr;
			off += ipv6_authlen(hp);
			break;
		}
		case IPPROTO_FRAGMENT: {
			struct frag_hdr *hp;

			err = skb_maybe_pull_tail(skb,
						  off +
						  sizeof(struct frag_hdr),
						  MAX_IPV6_HDR_LEN);
			if (err < 0)
				goto out;

			hp = OPT_HDR(struct frag_hdr, skb, off);

			/* non-zero offset or more-fragments bit: a real fragment */
			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
				fragment = true;

			nexthdr = hp->nexthdr;
			off += sizeof(struct frag_hdr);
			break;
		}
		default:
			/* not an extension header handled here: stop the walk */
			done = true;
			break;
		}
	}

	err = -EPROTO;

	/* !done: ran past the announced length while still in extension headers */
	if (!done || fragment)
		goto out;

	csum = skb_checksum_setup_ip(skb, nexthdr, off);
	if (IS_ERR(csum))
		return PTR_ERR(csum);

	if (recalculate)
		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					 &ipv6_hdr(skb)->daddr,
					 skb->len - off, nexthdr, 0);
	err = 0;

out:
	return err;
}
4177
4178/**
4179 * skb_checksum_setup - set up partial checksum offset
4180 * @skb: the skb to set up
4181 * @recalculate: if true the pseudo-header checksum will be recalculated
4182 */
4183int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
4184{
4185	int err;
4186
4187	switch (skb->protocol) {
4188	case htons(ETH_P_IP):
4189		err = skb_checksum_setup_ipv4(skb, recalculate);
4190		break;
4191
4192	case htons(ETH_P_IPV6):
4193		err = skb_checksum_setup_ipv6(skb, recalculate);
4194		break;
4195
4196	default:
4197		err = -EPROTO;
4198		break;
4199	}
4200
4201	return err;
4202}
4203EXPORT_SYMBOL(skb_checksum_setup);
4204
4205/**
4206 * skb_checksum_maybe_trim - maybe trims the given skb
4207 * @skb: the skb to check
4208 * @transport_len: the data length beyond the network header
4209 *
4210 * Checks whether the given skb has data beyond the given transport length.
4211 * If so, returns a cloned skb trimmed to this transport length.
4212 * Otherwise returns the provided skb. Returns NULL in error cases
4213 * (e.g. transport_len exceeds skb length or out-of-memory).
4214 *
4215 * Caller needs to set the skb transport header and free any returned skb if it
4216 * differs from the provided skb.
4217 */
4218static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
4219					       unsigned int transport_len)
4220{
4221	struct sk_buff *skb_chk;
4222	unsigned int len = skb_transport_offset(skb) + transport_len;
4223	int ret;
4224
4225	if (skb->len < len)
4226		return NULL;
4227	else if (skb->len == len)
4228		return skb;
4229
4230	skb_chk = skb_clone(skb, GFP_ATOMIC);
4231	if (!skb_chk)
4232		return NULL;
4233
4234	ret = pskb_trim_rcsum(skb_chk, len);
4235	if (ret) {
4236		kfree_skb(skb_chk);
4237		return NULL;
4238	}
4239
4240	return skb_chk;
4241}
4242
4243/**
4244 * skb_checksum_trimmed - validate checksum of an skb
4245 * @skb: the skb to check
4246 * @transport_len: the data length beyond the network header
4247 * @skb_chkf: checksum function to use
4248 *
4249 * Applies the given checksum function skb_chkf to the provided skb.
4250 * Returns a checked and maybe trimmed skb. Returns NULL on error.
4251 *
4252 * If the skb has data beyond the given transport length, then a
4253 * trimmed & cloned skb is checked and returned.
4254 *
4255 * Caller needs to set the skb transport header and free any returned skb if it
4256 * differs from the provided skb.
4257 */
4258struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
4259				     unsigned int transport_len,
4260				     __sum16(*skb_chkf)(struct sk_buff *skb))
4261{
4262	struct sk_buff *skb_chk;
4263	unsigned int offset = skb_transport_offset(skb);
4264	__sum16 ret;
4265
4266	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
4267	if (!skb_chk)
4268		goto err;
4269
4270	if (!pskb_may_pull(skb_chk, offset))
4271		goto err;
4272
4273	skb_pull_rcsum(skb_chk, offset);
4274	ret = skb_chkf(skb_chk);
4275	skb_push_rcsum(skb_chk, offset);
4276
4277	if (ret)
4278		goto err;
4279
4280	return skb_chk;
4281
4282err:
4283	if (skb_chk && skb_chk != skb)
4284		kfree_skb(skb_chk);
4285
4286	return NULL;
4287
4288}
4289EXPORT_SYMBOL(skb_checksum_trimmed);
4290
/*
 * Emit a rate-limited warning, prefixed with the name of @skb's device,
 * that received packets cannot be forwarded while LRO is enabled.
 * @skb->dev is dereferenced without a NULL check.
 */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
			     skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
4297
4298void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
4299{
4300	if (head_stolen) {
4301		skb_release_head_state(skb);
4302		kmem_cache_free(skbuff_head_cache, skb);
4303	} else {
4304		__kfree_skb(skb);
4305	}
4306}
4307EXPORT_SYMBOL(kfree_skb_partial);
4308
/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean, set to true when the head of @from was
 *		attached to @to as a page frag
 * @delta_truesize: how much more was allocated than was requested; this is
 *		    the amount added to @to->truesize (0 when the data was
 *		    copied into @to's tailroom)
 *
 * Returns true if the data of @from is now part of @to.  On false @to is
 * unchanged and *@delta_truesize is not written.
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
		      bool *fragstolen, int *delta_truesize)
{
	int i, delta, len = from->len;

	*fragstolen = false;

	if (skb_cloned(to))
		return false;

	/* cheapest case: everything fits into the tailroom of @to */
	if (len <= skb_tailroom(to)) {
		if (len)
			BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
		*delta_truesize = 0;
		return true;
	}

	if (skb_has_frag_list(to) || skb_has_frag_list(from))
		return false;

	if (skb_headlen(from) != 0) {
		struct page *page;
		unsigned int offset;

		/* ">=" rather than ">": the head of @from is added as a frag
		 * as well (presumably skb_fill_page_desc() accounts for it in
		 * nr_frags - confirm).
		 */
		if (skb_shinfo(to)->nr_frags +
		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
			return false;

		if (skb_head_is_locked(from))
			return false;

		/* everything but the struct sk_buff itself moves over to @to */
		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));

		page = virt_to_head_page(from->head);
		offset = from->data - (unsigned char *)page_address(page);

		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
				   page, offset, skb_headlen(from));
		*fragstolen = true;
	} else {
		if (skb_shinfo(to)->nr_frags +
		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
			return false;

		/* the (empty) head stays with @from: only frags move over */
		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
	}

	WARN_ON_ONCE(delta < len);

	/* append the frag descriptors of @from behind those of @to */
	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
	       skb_shinfo(from)->frags,
	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;

	/* not cloned: the frags are handed over, so @from forgets them */
	if (!skb_cloned(from))
		skb_shinfo(from)->nr_frags = 0;

	/* if the skb is not cloned this does nothing
	 * since we set nr_frags to 0.
	 */
	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
		skb_frag_ref(from, i);

	to->truesize += delta;
	to->len += len;
	to->data_len += len;

	*delta_truesize = delta;
	return true;
}
EXPORT_SYMBOL(skb_try_coalesce);
4387
4388/**
4389 * skb_scrub_packet - scrub an skb
4390 *
4391 * @skb: buffer to clean
4392 * @xnet: packet is crossing netns
4393 *
4394 * skb_scrub_packet can be used after encapsulating or decapsulting a packet
4395 * into/from a tunnel. Some information have to be cleared during these
4396 * operations.
4397 * skb_scrub_packet can also be used to clean a skb before injecting it in
4398 * another namespace (@xnet == true). We have to clear all information in the
4399 * skb that could impact namespace isolation.
4400 */
4401void skb_scrub_packet(struct sk_buff *skb, bool xnet)
4402{
4403	skb->tstamp = 0;
4404	skb->pkt_type = PACKET_HOST;
4405	skb->skb_iif = 0;
4406	skb->ignore_df = 0;
4407	skb_dst_drop(skb);
4408	secpath_reset(skb);
4409	nf_reset(skb);
4410	nf_reset_trace(skb);
4411
4412	if (!xnet)
4413		return;
4414
4415	skb_orphan(skb);
4416	skb->mark = 0;
4417}
4418EXPORT_SYMBOL_GPL(skb_scrub_packet);
4419
4420/**
4421 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
4422 *
4423 * @skb: GSO skb
4424 *
4425 * skb_gso_transport_seglen is used to determine the real size of the
4426 * individual segments, including Layer4 headers (TCP/UDP).
4427 *
4428 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
4429 */
4430unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
4431{
4432	const struct skb_shared_info *shinfo = skb_shinfo(skb);
4433	unsigned int thlen = 0;
4434
4435	if (skb->encapsulation) {
4436		thlen = skb_inner_transport_header(skb) -
4437			skb_transport_header(skb);
4438
4439		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
4440			thlen += inner_tcp_hdrlen(skb);
4441	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
4442		thlen = tcp_hdrlen(skb);
4443	} else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) {
4444		thlen = sizeof(struct sctphdr);
4445	}
4446	/* UFO sets gso_size to the size of the fragmentation
4447	 * payload, i.e. the size of the L4 (UDP) header is already
4448	 * accounted for.
4449	 */
4450	return thlen + shinfo->gso_size;
4451}
4452EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
4453
4454/**
4455 * skb_gso_validate_mtu - Return in case such skb fits a given MTU
4456 *
4457 * @skb: GSO skb
4458 * @mtu: MTU to validate against
4459 *
4460 * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU
4461 * once split.
4462 */
4463bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu)
4464{
4465	const struct skb_shared_info *shinfo = skb_shinfo(skb);
4466	const struct sk_buff *iter;
4467	unsigned int hlen;
4468
4469	hlen = skb_gso_network_seglen(skb);
4470
4471	if (shinfo->gso_size != GSO_BY_FRAGS)
4472		return hlen <= mtu;
4473
4474	/* Undo this so we can re-use header sizes */
4475	hlen -= GSO_BY_FRAGS;
4476
4477	skb_walk_frags(skb, iter) {
4478		if (hlen + skb_headlen(iter) > mtu)
4479			return false;
4480	}
4481
4482	return true;
4483}
4484EXPORT_SYMBOL_GPL(skb_gso_validate_mtu);
4485
4486static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
4487{
4488	if (skb_cow(skb, skb_headroom(skb)) < 0) {
4489		kfree_skb(skb);
4490		return NULL;
4491	}
4492
4493	memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
4494		2 * ETH_ALEN);
4495	skb->mac_header += VLAN_HLEN;
4496	return skb;
4497}
4498
4499struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
4500{
4501	struct vlan_hdr *vhdr;
4502	u16 vlan_tci;
4503
4504	if (unlikely(skb_vlan_tag_present(skb))) {
4505		/* vlan_tci is already set-up so leave this for another time */
4506		return skb;
4507	}
4508
4509	skb = skb_share_check(skb, GFP_ATOMIC);
4510	if (unlikely(!skb))
4511		goto err_free;
4512
4513	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
4514		goto err_free;
4515
4516	vhdr = (struct vlan_hdr *)skb->data;
4517	vlan_tci = ntohs(vhdr->h_vlan_TCI);
4518	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
4519
4520	skb_pull_rcsum(skb, VLAN_HLEN);
4521	vlan_set_encap_proto(skb, vhdr);
4522
4523	skb = skb_reorder_vlan_header(skb);
4524	if (unlikely(!skb))
4525		goto err_free;
4526
4527	skb_reset_network_header(skb);
4528	skb_reset_transport_header(skb);
4529	skb_reset_mac_len(skb);
4530
4531	return skb;
4532
4533err_free:
4534	kfree_skb(skb);
4535	return NULL;
4536}
4537EXPORT_SYMBOL(skb_vlan_untag);
4538
4539int skb_ensure_writable(struct sk_buff *skb, int write_len)
4540{
4541	if (!pskb_may_pull(skb, write_len))
4542		return -ENOMEM;
4543
4544	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
4545		return 0;
4546
4547	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
4548}
4549EXPORT_SYMBOL(skb_ensure_writable);
4550
4551/* remove VLAN header from packet and update csum accordingly.
4552 * expects a non skb_vlan_tag_present skb with a vlan tag payload
4553 */
4554int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
4555{
4556	struct vlan_hdr *vhdr;
4557	int offset = skb->data - skb_mac_header(skb);
4558	int err;
4559
4560	if (WARN_ONCE(offset,
4561		      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
4562		      offset)) {
4563		return -EINVAL;
4564	}
4565
4566	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
4567	if (unlikely(err))
4568		return err;
4569
4570	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
4571
4572	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
4573	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
4574
4575	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
4576	__skb_pull(skb, VLAN_HLEN);
4577
4578	vlan_set_encap_proto(skb, vhdr);
4579	skb->mac_header += VLAN_HLEN;
4580
4581	if (skb_network_offset(skb) < ETH_HLEN)
4582		skb_set_network_header(skb, ETH_HLEN);
4583
4584	skb_reset_mac_len(skb);
4585
4586	return err;
4587}
4588EXPORT_SYMBOL(__skb_vlan_pop);
4589
4590/* Pop a vlan tag either from hwaccel or from payload.
4591 * Expects skb->data at mac header.
4592 */
4593int skb_vlan_pop(struct sk_buff *skb)
4594{
4595	u16 vlan_tci;
4596	__be16 vlan_proto;
4597	int err;
4598
4599	if (likely(skb_vlan_tag_present(skb))) {
4600		skb->vlan_tci = 0;
4601	} else {
4602		if (unlikely(!eth_type_vlan(skb->protocol)))
4603			return 0;
4604
4605		err = __skb_vlan_pop(skb, &vlan_tci);
4606		if (err)
4607			return err;
4608	}
4609	/* move next vlan tag to hw accel tag */
4610	if (likely(!eth_type_vlan(skb->protocol)))
4611		return 0;
4612
4613	vlan_proto = skb->protocol;
4614	err = __skb_vlan_pop(skb, &vlan_tci);
4615	if (unlikely(err))
4616		return err;
4617
4618	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
4619	return 0;
4620}
4621EXPORT_SYMBOL(skb_vlan_pop);
4622
4623/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
4624 * Expects skb->data at mac header.
4625 */
4626int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
4627{
4628	if (skb_vlan_tag_present(skb)) {
4629		int offset = skb->data - skb_mac_header(skb);
4630		int err;
4631
4632		if (WARN_ONCE(offset,
4633			      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
4634			      offset)) {
4635			return -EINVAL;
4636		}
4637
4638		err = __vlan_insert_tag(skb, skb->vlan_proto,
4639					skb_vlan_tag_get(skb));
4640		if (err)
4641			return err;
4642
4643		skb->protocol = skb->vlan_proto;
4644		skb->mac_len += VLAN_HLEN;
4645
4646		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
4647	}
4648	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
4649	return 0;
4650}
4651EXPORT_SYMBOL(skb_vlan_push);
4652
4653/**
4654 * alloc_skb_with_frags - allocate skb with page frags
4655 *
4656 * @header_len: size of linear part
4657 * @data_len: needed length in frags
4658 * @max_page_order: max page order desired.
4659 * @errcode: pointer to error code if any
4660 * @gfp_mask: allocation mask
4661 *
4662 * This can be used to allocate a paged skb, given a maximal order for frags.
4663 */
4664struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
4665				     unsigned long data_len,
4666				     int max_page_order,
4667				     int *errcode,
4668				     gfp_t gfp_mask)
4669{
4670	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
4671	unsigned long chunk;
4672	struct sk_buff *skb;
4673	struct page *page;
4674	gfp_t gfp_head;
4675	int i;
4676
4677	*errcode = -EMSGSIZE;
4678	/* Note this test could be relaxed, if we succeed to allocate
4679	 * high order pages...
4680	 */
4681	if (npages > MAX_SKB_FRAGS)
4682		return NULL;
4683
4684	gfp_head = gfp_mask;
4685	if (gfp_head & __GFP_DIRECT_RECLAIM)
4686		gfp_head |= __GFP_REPEAT;
4687
4688	*errcode = -ENOBUFS;
4689	skb = alloc_skb(header_len, gfp_head);
4690	if (!skb)
4691		return NULL;
4692
4693	skb->truesize += npages << PAGE_SHIFT;
4694
4695	for (i = 0; npages > 0; i++) {
4696		int order = max_page_order;
4697
4698		while (order) {
4699			if (npages >= 1 << order) {
4700				page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
4701						   __GFP_COMP |
4702						   __GFP_NOWARN |
4703						   __GFP_NORETRY,
4704						   order);
4705				if (page)
4706					goto fill_page;
4707				/* Do not retry other high order allocations */
4708				order = 1;
4709				max_page_order = 0;
4710			}
4711			order--;
4712		}
4713		page = alloc_page(gfp_mask);
4714		if (!page)
4715			goto failure;
4716fill_page:
4717		chunk = min_t(unsigned long, data_len,
4718			      PAGE_SIZE << order);
4719		skb_fill_page_desc(skb, i, page, 0, chunk);
4720		data_len -= chunk;
4721		npages -= 1 << order;
4722	}
4723	return skb;
4724
4725failure:
4726	kfree_skb(skb);
4727	return NULL;
4728}
4729EXPORT_SYMBOL(alloc_skb_with_frags);
4730
4731/* carve out the first off bytes from skb when off < headlen */
4732static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
4733				    const int headlen, gfp_t gfp_mask)
4734{
4735	int i;
4736	int size = skb_end_offset(skb);
4737	int new_hlen = headlen - off;
4738	u8 *data;
4739
4740	size = SKB_DATA_ALIGN(size);
4741
4742	if (skb_pfmemalloc(skb))
4743		gfp_mask |= __GFP_MEMALLOC;
4744	data = kmalloc_reserve(size +
4745			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
4746			       gfp_mask, NUMA_NO_NODE, NULL);
4747	if (!data)
4748		return -ENOMEM;
4749
4750	size = SKB_WITH_OVERHEAD(ksize(data));
4751
4752	/* Copy real data, and all frags */
4753	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
4754	skb->len -= off;
4755
4756	memcpy((struct skb_shared_info *)(data + size),
4757	       skb_shinfo(skb),
4758	       offsetof(struct skb_shared_info,
4759			frags[skb_shinfo(skb)->nr_frags]));
4760	if (skb_cloned(skb)) {
4761		/* drop the old head gracefully */
4762		if (skb_orphan_frags(skb, gfp_mask)) {
4763			kfree(data);
4764			return -ENOMEM;
4765		}
4766		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
4767			skb_frag_ref(skb, i);
4768		if (skb_has_frag_list(skb))
4769			skb_clone_fraglist(skb);
4770		skb_release_data(skb);
4771	} else {
4772		/* we can reuse existing recount- all we did was
4773		 * relocate values
4774		 */
4775		skb_free_head(skb);
4776	}
4777
4778	skb->head = data;
4779	skb->data = data;
4780	skb->head_frag = 0;
4781#ifdef NET_SKBUFF_DATA_USES_OFFSET
4782	skb->end = size;
4783#else
4784	skb->end = skb->head + size;
4785#endif
4786	skb_set_tail_pointer(skb, skb_headlen(skb));
4787	skb_headers_offset_update(skb, 0);
4788	skb->cloned = 0;
4789	skb->hdr_len = 0;
4790	skb->nohdr = 0;
4791	atomic_set(&skb_shinfo(skb)->dataref, 1);
4792
4793	return 0;
4794}
4795
4796static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
4797
4798/* carve out the first eat bytes from skb's frag_list. May recurse into
4799 * pskb_carve()
4800 */
4801static int pskb_carve_frag_list(struct sk_buff *skb,
4802				struct skb_shared_info *shinfo, int eat,
4803				gfp_t gfp_mask)
4804{
4805	struct sk_buff *list = shinfo->frag_list;
4806	struct sk_buff *clone = NULL;
4807	struct sk_buff *insp = NULL;
4808
4809	do {
4810		if (!list) {
4811			pr_err("Not enough bytes to eat. Want %d\n", eat);
4812			return -EFAULT;
4813		}
4814		if (list->len <= eat) {
4815			/* Eaten as whole. */
4816			eat -= list->len;
4817			list = list->next;
4818			insp = list;
4819		} else {
4820			/* Eaten partially. */
4821			if (skb_shared(list)) {
4822				clone = skb_clone(list, gfp_mask);
4823				if (!clone)
4824					return -ENOMEM;
4825				insp = list->next;
4826				list = clone;
4827			} else {
4828				/* This may be pulled without problems. */
4829				insp = list;
4830			}
4831			if (pskb_carve(list, eat, gfp_mask) < 0) {
4832				kfree_skb(clone);
4833				return -ENOMEM;
4834			}
4835			break;
4836		}
4837	} while (eat);
4838
4839	/* Free pulled out fragments. */
4840	while ((list = shinfo->frag_list) != insp) {
4841		shinfo->frag_list = list->next;
4842		kfree_skb(list);
4843	}
4844	/* And insert new clone at head. */
4845	if (clone) {
4846		clone->next = list;
4847		shinfo->frag_list = clone;
4848	}
4849	return 0;
4850}
4851
4852/* carve off first len bytes from skb. Split line (off) is in the
4853 * non-linear part of skb
4854 */
4855static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
4856				       int pos, gfp_t gfp_mask)
4857{
4858	int i, k = 0;
4859	int size = skb_end_offset(skb);
4860	u8 *data;
4861	const int nfrags = skb_shinfo(skb)->nr_frags;
4862	struct skb_shared_info *shinfo;
4863
4864	size = SKB_DATA_ALIGN(size);
4865
4866	if (skb_pfmemalloc(skb))
4867		gfp_mask |= __GFP_MEMALLOC;
4868	data = kmalloc_reserve(size +
4869			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
4870			       gfp_mask, NUMA_NO_NODE, NULL);
4871	if (!data)
4872		return -ENOMEM;
4873
4874	size = SKB_WITH_OVERHEAD(ksize(data));
4875
4876	memcpy((struct skb_shared_info *)(data + size),
4877	       skb_shinfo(skb), offsetof(struct skb_shared_info,
4878					 frags[skb_shinfo(skb)->nr_frags]));
4879	if (skb_orphan_frags(skb, gfp_mask)) {
4880		kfree(data);
4881		return -ENOMEM;
4882	}
4883	shinfo = (struct skb_shared_info *)(data + size);
4884	for (i = 0; i < nfrags; i++) {
4885		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
4886
4887		if (pos + fsize > off) {
4888			shinfo->frags[k] = skb_shinfo(skb)->frags[i];
4889
4890			if (pos < off) {
4891				/* Split frag.
4892				 * We have two variants in this case:
4893				 * 1. Move all the frag to the second
4894				 *    part, if it is possible. F.e.
4895				 *    this approach is mandatory for TUX,
4896				 *    where splitting is expensive.
4897				 * 2. Split is accurately. We make this.
4898				 */
4899				shinfo->frags[0].page_offset += off - pos;
4900				skb_frag_size_sub(&shinfo->frags[0], off - pos);
4901			}
4902			skb_frag_ref(skb, i);
4903			k++;
4904		}
4905		pos += fsize;
4906	}
4907	shinfo->nr_frags = k;
4908	if (skb_has_frag_list(skb))
4909		skb_clone_fraglist(skb);
4910
4911	if (k == 0) {
4912		/* split line is in frag list */
4913		pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
4914	}
4915	skb_release_data(skb);
4916
4917	skb->head = data;
4918	skb->head_frag = 0;
4919	skb->data = data;
4920#ifdef NET_SKBUFF_DATA_USES_OFFSET
4921	skb->end = size;
4922#else
4923	skb->end = skb->head + size;
4924#endif
4925	skb_reset_tail_pointer(skb);
4926	skb_headers_offset_update(skb, 0);
4927	skb->cloned   = 0;
4928	skb->hdr_len  = 0;
4929	skb->nohdr    = 0;
4930	skb->len -= off;
4931	skb->data_len = skb->len;
4932	atomic_set(&skb_shinfo(skb)->dataref, 1);
4933	return 0;
4934}
4935
4936/* remove len bytes from the beginning of the skb */
4937static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
4938{
4939	int headlen = skb_headlen(skb);
4940
4941	if (len < headlen)
4942		return pskb_carve_inside_header(skb, len, headlen, gfp);
4943	else
4944		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
4945}
4946
4947/* Extract to_copy bytes starting at off from skb, and return this in
4948 * a new skb
4949 */
4950struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
4951			     int to_copy, gfp_t gfp)
4952{
4953	struct sk_buff  *clone = skb_clone(skb, gfp);
4954
4955	if (!clone)
4956		return NULL;
4957
4958	if (pskb_carve(clone, off, gfp) < 0 ||
4959	    pskb_trim(clone, to_copy)) {
4960		kfree_skb(clone);
4961		return NULL;
4962	}
4963	return clone;
4964}
4965EXPORT_SYMBOL(pskb_extract);
4966
4967/**
4968 * skb_condense - try to get rid of fragments/frag_list if possible
4969 * @skb: buffer
4970 *
4971 * Can be used to save memory before skb is added to a busy queue.
4972 * If packet has bytes in frags and enough tail room in skb->head,
4973 * pull all of them, so that we can free the frags right now and adjust
4974 * truesize.
4975 * Notes:
4976 *	We do not reallocate skb->head thus can not fail.
4977 *	Caller must re-evaluate skb->truesize if needed.
4978 */
4979void skb_condense(struct sk_buff *skb)
4980{
4981	if (skb->data_len) {
4982		if (skb->data_len > skb->end - skb->tail ||
4983		    skb_cloned(skb))
4984			return;
4985
4986		/* Nice, we can free page frag(s) right now */
4987		__pskb_pull_tail(skb, skb->data_len);
4988	}
4989	/* At this point, skb->truesize might be over estimated,
4990	 * because skb had a fragment, and fragments do not tell
4991	 * their truesize.
4992	 * When we pulled its content into skb->head, fragment
4993	 * was freed, but __pskb_pull_tail() could not possibly
4994	 * adjust skb->truesize, not knowing the frag truesize.
4995	 */
4996	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4997}