net/core/skbuff.c at v5.11-rc7 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / skbuff.c
at v5.11-rc7 6382 lines 161 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *	Routines having to do with the 'struct sk_buff' memory handlers.
   4 *
   5 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
   6 *			Florian La Roche <rzsfl@rz.uni-sb.de>
   7 *
   8 *	Fixes:
   9 *		Alan Cox	:	Fixed the worst of the load
  10 *					balancer bugs.
  11 *		Dave Platt	:	Interrupt stacking fix.
  12 *	Richard Kooijman	:	Timestamp fixes.
  13 *		Alan Cox	:	Changed buffer format.
  14 *		Alan Cox	:	destructor hook for AF_UNIX etc.
  15 *		Linus Torvalds	:	Better skb_clone.
  16 *		Alan Cox	:	Added skb_copy.
  17 *		Alan Cox	:	Added all the changed routines Linus
  18 *					only put in the headers
  19 *		Ray VanTassle	:	Fixed --skb->lock in free
  20 *		Alan Cox	:	skb_copy copy arp field
  21 *		Andi Kleen	:	slabified it.
  22 *		Robert Olsson	:	Removed skb_head_pool
  23 *
  24 *	NOTE:
  25 *		The __skb_ routines should be called with interrupts
  26 *	disabled, or you better be *real* sure that the operation is atomic
  27 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
  28 *	or via disabling bottom half handlers, etc).
  29 */
  30
  31/*
  32 *	The functions in this file will not compile correctly with gcc 2.4.x
  33 */
  34
  35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  36
  37#include <linux/module.h>
  38#include <linux/types.h>
  39#include <linux/kernel.h>
  40#include <linux/mm.h>
  41#include <linux/interrupt.h>
  42#include <linux/in.h>
  43#include <linux/inet.h>
  44#include <linux/slab.h>
  45#include <linux/tcp.h>
  46#include <linux/udp.h>
  47#include <linux/sctp.h>
  48#include <linux/netdevice.h>
  49#ifdef CONFIG_NET_CLS_ACT
  50#include <net/pkt_sched.h>
  51#endif
  52#include <linux/string.h>
  53#include <linux/skbuff.h>
  54#include <linux/splice.h>
  55#include <linux/cache.h>
  56#include <linux/rtnetlink.h>
  57#include <linux/init.h>
  58#include <linux/scatterlist.h>
  59#include <linux/errqueue.h>
  60#include <linux/prefetch.h>
  61#include <linux/if_vlan.h>
  62#include <linux/mpls.h>
  63
  64#include <net/protocol.h>
  65#include <net/dst.h>
  66#include <net/sock.h>
  67#include <net/checksum.h>
  68#include <net/ip6_checksum.h>
  69#include <net/xfrm.h>
  70#include <net/mpls.h>
  71#include <net/mptcp.h>
  72
  73#include <linux/uaccess.h>
  74#include <trace/events/skb.h>
  75#include <linux/highmem.h>
  76#include <linux/capability.h>
  77#include <linux/user_namespace.h>
  78#include <linux/indirect_call_wrapper.h>
  79
  80#include "datagram.h"
  81
  82struct kmem_cache *skbuff_head_cache __ro_after_init;
  83static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
  84#ifdef CONFIG_SKB_EXTENSIONS
  85static struct kmem_cache *skbuff_ext_cache __ro_after_init;
  86#endif
  87int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
  88EXPORT_SYMBOL(sysctl_max_skb_frags);
  89
  90/**
  91 *	skb_panic - private function for out-of-line support
  92 *	@skb:	buffer
  93 *	@sz:	size
  94 *	@addr:	address
  95 *	@msg:	skb_over_panic or skb_under_panic
  96 *
  97 *	Out-of-line support for skb_put() and skb_push().
  98 *	Called via the wrapper skb_over_panic() or skb_under_panic().
  99 *	Keep out of line to prevent kernel bloat.
 100 *	__builtin_return_address is not used because it is not always reliable.
 101 */
 102static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
 103		      const char msg[])
 104{
 105	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
 106		 msg, addr, skb->len, sz, skb->head, skb->data,
 107		 (unsigned long)skb->tail, (unsigned long)skb->end,
 108		 skb->dev ? skb->dev->name : "<NULL>");
 109	BUG();
 110}
 111
 112static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 113{
 114	skb_panic(skb, sz, addr, __func__);
 115}
 116
 117static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 118{
 119	skb_panic(skb, sz, addr, __func__);
 120}
 121
 122/*
 123 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 124 * the caller if emergency pfmemalloc reserves are being used. If it is and
 125 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 126 * may be used. Otherwise, the packet data may be discarded until enough
 127 * memory is free
 128 */
 129#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
 130	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
 131
 132static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
 133			       unsigned long ip, bool *pfmemalloc)
 134{
 135	void *obj;
 136	bool ret_pfmemalloc = false;
 137
 138	/*
 139	 * Try a regular allocation, when that fails and we're not entitled
 140	 * to the reserves, fail.
 141	 */
 142	obj = kmalloc_node_track_caller(size,
 143					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 144					node);
 145	if (obj || !(gfp_pfmemalloc_allowed(flags)))
 146		goto out;
 147
 148	/* Try again but now we are using pfmemalloc reserves */
 149	ret_pfmemalloc = true;
 150	obj = kmalloc_node_track_caller(size, flags, node);
 151
 152out:
 153	if (pfmemalloc)
 154		*pfmemalloc = ret_pfmemalloc;
 155
 156	return obj;
 157}
 158
 159/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
 160 *	'private' fields and also do memory statistics to find all the
 161 *	[BEEP] leaks.
 162 *
 163 */
 164
 165/**
 166 *	__alloc_skb	-	allocate a network buffer
 167 *	@size: size to allocate
 168 *	@gfp_mask: allocation mask
 169 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 170 *		instead of head cache and allocate a cloned (child) skb.
 171 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 172 *		allocations in case the data is required for writeback
 173 *	@node: numa node to allocate memory on
 174 *
 175 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 176 *	tail room of at least size bytes. The object has a reference count
 177 *	of one. The return is the buffer. On a failure the return is %NULL.
 178 *
 179 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 180 *	%GFP_ATOMIC.
 181 */
 182struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 183			    int flags, int node)
 184{
 185	struct kmem_cache *cache;
 186	struct skb_shared_info *shinfo;
 187	struct sk_buff *skb;
 188	u8 *data;
 189	bool pfmemalloc;
 190
 191	cache = (flags & SKB_ALLOC_FCLONE)
 192		? skbuff_fclone_cache : skbuff_head_cache;
 193
 194	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
 195		gfp_mask |= __GFP_MEMALLOC;
 196
 197	/* Get the HEAD */
 198	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 199	if (!skb)
 200		goto out;
 201	prefetchw(skb);
 202
 203	/* We do our best to align skb_shared_info on a separate cache
 204	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 205	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 206	 * Both skb->head and skb_shared_info are cache line aligned.
 207	 */
 208	size = SKB_DATA_ALIGN(size);
 209	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 210	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
 211	if (!data)
 212		goto nodata;
 213	/* kmalloc(size) might give us more room than requested.
 214	 * Put skb_shared_info exactly at the end of allocated zone,
 215	 * to allow max possible filling before reallocation.
 216	 */
 217	size = SKB_WITH_OVERHEAD(ksize(data));
 218	prefetchw(data + size);
 219
 220	/*
 221	 * Only clear those fields we need to clear, not those that we will
 222	 * actually initialise below. Hence, don't put any more fields after
 223	 * the tail pointer in struct sk_buff!
 224	 */
 225	memset(skb, 0, offsetof(struct sk_buff, tail));
 226	/* Account for allocated memory : skb + skb->head */
 227	skb->truesize = SKB_TRUESIZE(size);
 228	skb->pfmemalloc = pfmemalloc;
 229	refcount_set(&skb->users, 1);
 230	skb->head = data;
 231	skb->data = data;
 232	skb_reset_tail_pointer(skb);
 233	skb->end = skb->tail + size;
 234	skb->mac_header = (typeof(skb->mac_header))~0U;
 235	skb->transport_header = (typeof(skb->transport_header))~0U;
 236
 237	/* make sure we initialize shinfo sequentially */
 238	shinfo = skb_shinfo(skb);
 239	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 240	atomic_set(&shinfo->dataref, 1);
 241
 242	if (flags & SKB_ALLOC_FCLONE) {
 243		struct sk_buff_fclones *fclones;
 244
 245		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 246
 247		skb->fclone = SKB_FCLONE_ORIG;
 248		refcount_set(&fclones->fclone_ref, 1);
 249
 250		fclones->skb2.fclone = SKB_FCLONE_CLONE;
 251	}
 252
 253	skb_set_kcov_handle(skb, kcov_common_handle());
 254
 255out:
 256	return skb;
 257nodata:
 258	kmem_cache_free(cache, skb);
 259	skb = NULL;
 260	goto out;
 261}
 262EXPORT_SYMBOL(__alloc_skb);
 263
 264/* Caller must provide SKB that is memset cleared */
 265static struct sk_buff *__build_skb_around(struct sk_buff *skb,
 266					  void *data, unsigned int frag_size)
 267{
 268	struct skb_shared_info *shinfo;
 269	unsigned int size = frag_size ? : ksize(data);
 270
 271	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 272
 273	/* Assumes caller memset cleared SKB */
 274	skb->truesize = SKB_TRUESIZE(size);
 275	refcount_set(&skb->users, 1);
 276	skb->head = data;
 277	skb->data = data;
 278	skb_reset_tail_pointer(skb);
 279	skb->end = skb->tail + size;
 280	skb->mac_header = (typeof(skb->mac_header))~0U;
 281	skb->transport_header = (typeof(skb->transport_header))~0U;
 282
 283	/* make sure we initialize shinfo sequentially */
 284	shinfo = skb_shinfo(skb);
 285	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 286	atomic_set(&shinfo->dataref, 1);
 287
 288	skb_set_kcov_handle(skb, kcov_common_handle());
 289
 290	return skb;
 291}
 292
 293/**
 294 * __build_skb - build a network buffer
 295 * @data: data buffer provided by caller
 296 * @frag_size: size of data, or 0 if head was kmalloced
 297 *
 298 * Allocate a new &sk_buff. Caller provides space holding head and
 299 * skb_shared_info. @data must have been allocated by kmalloc() only if
 300 * @frag_size is 0, otherwise data should come from the page allocator
 301 *  or vmalloc()
 302 * The return is the new skb buffer.
 303 * On a failure the return is %NULL, and @data is not freed.
 304 * Notes :
 305 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 306 *  Driver should add room at head (NET_SKB_PAD) and
 307 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 308 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 309 *  before giving packet to stack.
 310 *  RX rings only contain data buffers, not full skbs.
 311 */
 312struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 313{
 314	struct sk_buff *skb;
 315
 316	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 317	if (unlikely(!skb))
 318		return NULL;
 319
 320	memset(skb, 0, offsetof(struct sk_buff, tail));
 321
 322	return __build_skb_around(skb, data, frag_size);
 323}
 324
 325/* build_skb() is wrapper over __build_skb(), that specifically
 326 * takes care of skb->head and skb->pfmemalloc
 327 * This means that if @frag_size is not zero, then @data must be backed
 328 * by a page fragment, not kmalloc() or vmalloc()
 329 */
 330struct sk_buff *build_skb(void *data, unsigned int frag_size)
 331{
 332	struct sk_buff *skb = __build_skb(data, frag_size);
 333
 334	if (skb && frag_size) {
 335		skb->head_frag = 1;
 336		if (page_is_pfmemalloc(virt_to_head_page(data)))
 337			skb->pfmemalloc = 1;
 338	}
 339	return skb;
 340}
 341EXPORT_SYMBOL(build_skb);
 342
 343/**
 344 * build_skb_around - build a network buffer around provided skb
 345 * @skb: sk_buff provided by caller, must be memset cleared
 346 * @data: data buffer provided by caller
 347 * @frag_size: size of data, or 0 if head was kmalloced
 348 */
 349struct sk_buff *build_skb_around(struct sk_buff *skb,
 350				 void *data, unsigned int frag_size)
 351{
 352	if (unlikely(!skb))
 353		return NULL;
 354
 355	skb = __build_skb_around(skb, data, frag_size);
 356
 357	if (skb && frag_size) {
 358		skb->head_frag = 1;
 359		if (page_is_pfmemalloc(virt_to_head_page(data)))
 360			skb->pfmemalloc = 1;
 361	}
 362	return skb;
 363}
 364EXPORT_SYMBOL(build_skb_around);
 365
 366#define NAPI_SKB_CACHE_SIZE	64
 367
 368struct napi_alloc_cache {
 369	struct page_frag_cache page;
 370	unsigned int skb_count;
 371	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 372};
 373
 374static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 375static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 376
 377static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 378{
 379	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 380
 381	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
 382}
 383
 384void *napi_alloc_frag(unsigned int fragsz)
 385{
 386	fragsz = SKB_DATA_ALIGN(fragsz);
 387
 388	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
 389}
 390EXPORT_SYMBOL(napi_alloc_frag);
 391
 392/**
 393 * netdev_alloc_frag - allocate a page fragment
 394 * @fragsz: fragment size
 395 *
 396 * Allocates a frag from a page for receive buffer.
 397 * Uses GFP_ATOMIC allocations.
 398 */
 399void *netdev_alloc_frag(unsigned int fragsz)
 400{
 401	struct page_frag_cache *nc;
 402	void *data;
 403
 404	fragsz = SKB_DATA_ALIGN(fragsz);
 405	if (in_irq() || irqs_disabled()) {
 406		nc = this_cpu_ptr(&netdev_alloc_cache);
 407		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
 408	} else {
 409		local_bh_disable();
 410		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
 411		local_bh_enable();
 412	}
 413	return data;
 414}
 415EXPORT_SYMBOL(netdev_alloc_frag);
 416
 417/**
 418 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 419 *	@dev: network device to receive on
 420 *	@len: length to allocate
 421 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 422 *
 423 *	Allocate a new &sk_buff and assign it a usage count of one. The
 424 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 425 *	the headroom they think they need without accounting for the
 426 *	built in space. The built in space is used for optimisations.
 427 *
 428 *	%NULL is returned if there is no free memory.
 429 */
 430struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 431				   gfp_t gfp_mask)
 432{
 433	struct page_frag_cache *nc;
 434	struct sk_buff *skb;
 435	bool pfmemalloc;
 436	void *data;
 437
 438	len += NET_SKB_PAD;
 439
 440	/* If requested length is either too small or too big,
 441	 * we use kmalloc() for skb->head allocation.
 442	 */
 443	if (len <= SKB_WITH_OVERHEAD(1024) ||
 444	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 445	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 446		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
 447		if (!skb)
 448			goto skb_fail;
 449		goto skb_success;
 450	}
 451
 452	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 453	len = SKB_DATA_ALIGN(len);
 454
 455	if (sk_memalloc_socks())
 456		gfp_mask |= __GFP_MEMALLOC;
 457
 458	if (in_irq() || irqs_disabled()) {
 459		nc = this_cpu_ptr(&netdev_alloc_cache);
 460		data = page_frag_alloc(nc, len, gfp_mask);
 461		pfmemalloc = nc->pfmemalloc;
 462	} else {
 463		local_bh_disable();
 464		nc = this_cpu_ptr(&napi_alloc_cache.page);
 465		data = page_frag_alloc(nc, len, gfp_mask);
 466		pfmemalloc = nc->pfmemalloc;
 467		local_bh_enable();
 468	}
 469
 470	if (unlikely(!data))
 471		return NULL;
 472
 473	skb = __build_skb(data, len);
 474	if (unlikely(!skb)) {
 475		skb_free_frag(data);
 476		return NULL;
 477	}
 478
 479	if (pfmemalloc)
 480		skb->pfmemalloc = 1;
 481	skb->head_frag = 1;
 482
 483skb_success:
 484	skb_reserve(skb, NET_SKB_PAD);
 485	skb->dev = dev;
 486
 487skb_fail:
 488	return skb;
 489}
 490EXPORT_SYMBOL(__netdev_alloc_skb);
 491
 492/**
 493 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 494 *	@napi: napi instance this buffer was allocated for
 495 *	@len: length to allocate
 496 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 497 *
 498 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 499 *	attempt to allocate the head from a special reserved region used
 500 *	only for NAPI Rx allocation.  By doing this we can save several
 501 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 502 *
 503 *	%NULL is returned if there is no free memory.
 504 */
 505struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 506				 gfp_t gfp_mask)
 507{
 508	struct napi_alloc_cache *nc;
 509	struct sk_buff *skb;
 510	void *data;
 511
 512	len += NET_SKB_PAD + NET_IP_ALIGN;
 513
 514	/* If requested length is either too small or too big,
 515	 * we use kmalloc() for skb->head allocation.
 516	 */
 517	if (len <= SKB_WITH_OVERHEAD(1024) ||
 518	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 519	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 520		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
 521		if (!skb)
 522			goto skb_fail;
 523		goto skb_success;
 524	}
 525
 526	nc = this_cpu_ptr(&napi_alloc_cache);
 527	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 528	len = SKB_DATA_ALIGN(len);
 529
 530	if (sk_memalloc_socks())
 531		gfp_mask |= __GFP_MEMALLOC;
 532
 533	data = page_frag_alloc(&nc->page, len, gfp_mask);
 534	if (unlikely(!data))
 535		return NULL;
 536
 537	skb = __build_skb(data, len);
 538	if (unlikely(!skb)) {
 539		skb_free_frag(data);
 540		return NULL;
 541	}
 542
 543	if (nc->page.pfmemalloc)
 544		skb->pfmemalloc = 1;
 545	skb->head_frag = 1;
 546
 547skb_success:
 548	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 549	skb->dev = napi->dev;
 550
 551skb_fail:
 552	return skb;
 553}
 554EXPORT_SYMBOL(__napi_alloc_skb);
 555
 556void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 557		     int size, unsigned int truesize)
 558{
 559	skb_fill_page_desc(skb, i, page, off, size);
 560	skb->len += size;
 561	skb->data_len += size;
 562	skb->truesize += truesize;
 563}
 564EXPORT_SYMBOL(skb_add_rx_frag);
 565
 566void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 567			  unsigned int truesize)
 568{
 569	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 570
 571	skb_frag_size_add(frag, size);
 572	skb->len += size;
 573	skb->data_len += size;
 574	skb->truesize += truesize;
 575}
 576EXPORT_SYMBOL(skb_coalesce_rx_frag);
 577
 578static void skb_drop_list(struct sk_buff **listp)
 579{
 580	kfree_skb_list(*listp);
 581	*listp = NULL;
 582}
 583
 584static inline void skb_drop_fraglist(struct sk_buff *skb)
 585{
 586	skb_drop_list(&skb_shinfo(skb)->frag_list);
 587}
 588
 589static void skb_clone_fraglist(struct sk_buff *skb)
 590{
 591	struct sk_buff *list;
 592
 593	skb_walk_frags(skb, list)
 594		skb_get(list);
 595}
 596
 597static void skb_free_head(struct sk_buff *skb)
 598{
 599	unsigned char *head = skb->head;
 600
 601	if (skb->head_frag)
 602		skb_free_frag(head);
 603	else
 604		kfree(head);
 605}
 606
 607static void skb_release_data(struct sk_buff *skb)
 608{
 609	struct skb_shared_info *shinfo = skb_shinfo(skb);
 610	int i;
 611
 612	if (skb->cloned &&
 613	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 614			      &shinfo->dataref))
 615		return;
 616
 617	for (i = 0; i < shinfo->nr_frags; i++)
 618		__skb_frag_unref(&shinfo->frags[i]);
 619
 620	if (shinfo->frag_list)
 621		kfree_skb_list(shinfo->frag_list);
 622
 623	skb_zcopy_clear(skb, true);
 624	skb_free_head(skb);
 625}
 626
 627/*
 628 *	Free an skbuff by memory without cleaning the state.
 629 */
 630static void kfree_skbmem(struct sk_buff *skb)
 631{
 632	struct sk_buff_fclones *fclones;
 633
 634	switch (skb->fclone) {
 635	case SKB_FCLONE_UNAVAILABLE:
 636		kmem_cache_free(skbuff_head_cache, skb);
 637		return;
 638
 639	case SKB_FCLONE_ORIG:
 640		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 641
 642		/* We usually free the clone (TX completion) before original skb
 643		 * This test would have no chance to be true for the clone,
 644		 * while here, branch prediction will be good.
 645		 */
 646		if (refcount_read(&fclones->fclone_ref) == 1)
 647			goto fastpath;
 648		break;
 649
 650	default: /* SKB_FCLONE_CLONE */
 651		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 652		break;
 653	}
 654	if (!refcount_dec_and_test(&fclones->fclone_ref))
 655		return;
 656fastpath:
 657	kmem_cache_free(skbuff_fclone_cache, fclones);
 658}
 659
 660void skb_release_head_state(struct sk_buff *skb)
 661{
 662	skb_dst_drop(skb);
 663	if (skb->destructor) {
 664		WARN_ON(in_irq());
 665		skb->destructor(skb);
 666	}
 667#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 668	nf_conntrack_put(skb_nfct(skb));
 669#endif
 670	skb_ext_put(skb);
 671}
 672
 673/* Free everything but the sk_buff shell. */
 674static void skb_release_all(struct sk_buff *skb)
 675{
 676	skb_release_head_state(skb);
 677	if (likely(skb->head))
 678		skb_release_data(skb);
 679}
 680
 681/**
 682 *	__kfree_skb - private function
 683 *	@skb: buffer
 684 *
 685 *	Free an sk_buff. Release anything attached to the buffer.
 686 *	Clean the state. This is an internal helper function. Users should
 687 *	always call kfree_skb
 688 */
 689
 690void __kfree_skb(struct sk_buff *skb)
 691{
 692	skb_release_all(skb);
 693	kfree_skbmem(skb);
 694}
 695EXPORT_SYMBOL(__kfree_skb);
 696
 697/**
 698 *	kfree_skb - free an sk_buff
 699 *	@skb: buffer to free
 700 *
 701 *	Drop a reference to the buffer and free it if the usage count has
 702 *	hit zero.
 703 */
 704void kfree_skb(struct sk_buff *skb)
 705{
 706	if (!skb_unref(skb))
 707		return;
 708
 709	trace_kfree_skb(skb, __builtin_return_address(0));
 710	__kfree_skb(skb);
 711}
 712EXPORT_SYMBOL(kfree_skb);
 713
 714void kfree_skb_list(struct sk_buff *segs)
 715{
 716	while (segs) {
 717		struct sk_buff *next = segs->next;
 718
 719		kfree_skb(segs);
 720		segs = next;
 721	}
 722}
 723EXPORT_SYMBOL(kfree_skb_list);
 724
 725/* Dump skb information and contents.
 726 *
 727 * Must only be called from net_ratelimit()-ed paths.
 728 *
 729 * Dumps whole packets if full_pkt, only headers otherwise.
 730 */
 731void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
 732{
 733	struct skb_shared_info *sh = skb_shinfo(skb);
 734	struct net_device *dev = skb->dev;
 735	struct sock *sk = skb->sk;
 736	struct sk_buff *list_skb;
 737	bool has_mac, has_trans;
 738	int headroom, tailroom;
 739	int i, len, seg_len;
 740
 741	if (full_pkt)
 742		len = skb->len;
 743	else
 744		len = min_t(int, skb->len, MAX_HEADER + 128);
 745
 746	headroom = skb_headroom(skb);
 747	tailroom = skb_tailroom(skb);
 748
 749	has_mac = skb_mac_header_was_set(skb);
 750	has_trans = skb_transport_header_was_set(skb);
 751
 752	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
 753	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
 754	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
 755	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
 756	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
 757	       level, skb->len, headroom, skb_headlen(skb), tailroom,
 758	       has_mac ? skb->mac_header : -1,
 759	       has_mac ? skb_mac_header_len(skb) : -1,
 760	       skb->network_header,
 761	       has_trans ? skb_network_header_len(skb) : -1,
 762	       has_trans ? skb->transport_header : -1,
 763	       sh->tx_flags, sh->nr_frags,
 764	       sh->gso_size, sh->gso_type, sh->gso_segs,
 765	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
 766	       skb->csum_valid, skb->csum_level,
 767	       skb->hash, skb->sw_hash, skb->l4_hash,
 768	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
 769
 770	if (dev)
 771		printk("%sdev name=%s feat=0x%pNF\n",
 772		       level, dev->name, &dev->features);
 773	if (sk)
 774		printk("%ssk family=%hu type=%u proto=%u\n",
 775		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);
 776
 777	if (full_pkt && headroom)
 778		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
 779			       16, 1, skb->head, headroom, false);
 780
 781	seg_len = min_t(int, skb_headlen(skb), len);
 782	if (seg_len)
 783		print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
 784			       16, 1, skb->data, seg_len, false);
 785	len -= seg_len;
 786
 787	if (full_pkt && tailroom)
 788		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
 789			       16, 1, skb_tail_pointer(skb), tailroom, false);
 790
 791	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
 792		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 793		u32 p_off, p_len, copied;
 794		struct page *p;
 795		u8 *vaddr;
 796
 797		skb_frag_foreach_page(frag, skb_frag_off(frag),
 798				      skb_frag_size(frag), p, p_off, p_len,
 799				      copied) {
 800			seg_len = min_t(int, p_len, len);
 801			vaddr = kmap_atomic(p);
 802			print_hex_dump(level, "skb frag:     ",
 803				       DUMP_PREFIX_OFFSET,
 804				       16, 1, vaddr + p_off, seg_len, false);
 805			kunmap_atomic(vaddr);
 806			len -= seg_len;
 807			if (!len)
 808				break;
 809		}
 810	}
 811
 812	if (full_pkt && skb_has_frag_list(skb)) {
 813		printk("skb fraglist:\n");
 814		skb_walk_frags(skb, list_skb)
 815			skb_dump(level, list_skb, true);
 816	}
 817}
 818EXPORT_SYMBOL(skb_dump);
 819
 820/**
 821 *	skb_tx_error - report an sk_buff xmit error
 822 *	@skb: buffer that triggered an error
 823 *
 824 *	Report xmit error if a device callback is tracking this skb.
 825 *	skb must be freed afterwards.
 826 */
 827void skb_tx_error(struct sk_buff *skb)
 828{
 829	skb_zcopy_clear(skb, true);
 830}
 831EXPORT_SYMBOL(skb_tx_error);
 832
 833#ifdef CONFIG_TRACEPOINTS
 834/**
 835 *	consume_skb - free an skbuff
 836 *	@skb: buffer to free
 837 *
 838 *	Drop a ref to the buffer and free it if the usage count has hit zero
 839 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 840 *	is being dropped after a failure and notes that
 841 */
 842void consume_skb(struct sk_buff *skb)
 843{
 844	if (!skb_unref(skb))
 845		return;
 846
 847	trace_consume_skb(skb);
 848	__kfree_skb(skb);
 849}
 850EXPORT_SYMBOL(consume_skb);
 851#endif
 852
 853/**
 854 *	__consume_stateless_skb - free an skbuff, assuming it is stateless
 855 *	@skb: buffer to free
 856 *
 857 *	Like consume_skb(), but this variant assumes that this is the last
 858 *	skb reference and all the head states have been already dropped
 859 */
 860void __consume_stateless_skb(struct sk_buff *skb)
 861{
 862	trace_consume_skb(skb);
 863	skb_release_data(skb);
 864	kfree_skbmem(skb);
 865}
 866
 867void __kfree_skb_flush(void)
 868{
 869	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 870
 871	/* flush skb_cache if containing objects */
 872	if (nc->skb_count) {
 873		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
 874				     nc->skb_cache);
 875		nc->skb_count = 0;
 876	}
 877}
 878
 879static inline void _kfree_skb_defer(struct sk_buff *skb)
 880{
 881	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 882
 883	/* drop skb->head and call any destructors for packet */
 884	skb_release_all(skb);
 885
 886	/* record skb to CPU local list */
 887	nc->skb_cache[nc->skb_count++] = skb;
 888
 889#ifdef CONFIG_SLUB
 890	/* SLUB writes into objects when freeing */
 891	prefetchw(skb);
 892#endif
 893
 894	/* flush skb_cache if it is filled */
 895	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
 896		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
 897				     nc->skb_cache);
 898		nc->skb_count = 0;
 899	}
 900}
 901void __kfree_skb_defer(struct sk_buff *skb)
 902{
 903	_kfree_skb_defer(skb);
 904}
 905
 906void napi_consume_skb(struct sk_buff *skb, int budget)
 907{
 908	/* Zero budget indicate non-NAPI context called us, like netpoll */
 909	if (unlikely(!budget)) {
 910		dev_consume_skb_any(skb);
 911		return;
 912	}
 913
 914	lockdep_assert_in_softirq();
 915
 916	if (!skb_unref(skb))
 917		return;
 918
 919	/* if reaching here SKB is ready to free */
 920	trace_consume_skb(skb);
 921
 922	/* if SKB is a clone, don't handle this case */
 923	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
 924		__kfree_skb(skb);
 925		return;
 926	}
 927
 928	_kfree_skb_defer(skb);
 929}
 930EXPORT_SYMBOL(napi_consume_skb);
 931
 932/* Make sure a field is enclosed inside headers_start/headers_end section */
 933#define CHECK_SKB_FIELD(field) \
 934	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
 935		     offsetof(struct sk_buff, headers_start));	\
 936	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
 937		     offsetof(struct sk_buff, headers_end));	\
 938
 939static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 940{
 941	new->tstamp		= old->tstamp;
 942	/* We do not copy old->sk */
 943	new->dev		= old->dev;
 944	memcpy(new->cb, old->cb, sizeof(old->cb));
 945	skb_dst_copy(new, old);
 946	__skb_ext_copy(new, old);
 947	__nf_copy(new, old, false);
 948
 949	/* Note : this field could be in headers_start/headers_end section
 950	 * It is not yet because we do not want to have a 16 bit hole
 951	 */
 952	new->queue_mapping = old->queue_mapping;
 953
 954	memcpy(&new->headers_start, &old->headers_start,
 955	       offsetof(struct sk_buff, headers_end) -
 956	       offsetof(struct sk_buff, headers_start));
 957	CHECK_SKB_FIELD(protocol);
 958	CHECK_SKB_FIELD(csum);
 959	CHECK_SKB_FIELD(hash);
 960	CHECK_SKB_FIELD(priority);
 961	CHECK_SKB_FIELD(skb_iif);
 962	CHECK_SKB_FIELD(vlan_proto);
 963	CHECK_SKB_FIELD(vlan_tci);
 964	CHECK_SKB_FIELD(transport_header);
 965	CHECK_SKB_FIELD(network_header);
 966	CHECK_SKB_FIELD(mac_header);
 967	CHECK_SKB_FIELD(inner_protocol);
 968	CHECK_SKB_FIELD(inner_transport_header);
 969	CHECK_SKB_FIELD(inner_network_header);
 970	CHECK_SKB_FIELD(inner_mac_header);
 971	CHECK_SKB_FIELD(mark);
 972#ifdef CONFIG_NETWORK_SECMARK
 973	CHECK_SKB_FIELD(secmark);
 974#endif
 975#ifdef CONFIG_NET_RX_BUSY_POLL
 976	CHECK_SKB_FIELD(napi_id);
 977#endif
 978#ifdef CONFIG_XPS
 979	CHECK_SKB_FIELD(sender_cpu);
 980#endif
 981#ifdef CONFIG_NET_SCHED
 982	CHECK_SKB_FIELD(tc_index);
 983#endif
 984
 985}
 986
 987/*
 988 * You should not add any new code to this function.  Add it to
 989 * __copy_skb_header above instead.
 990 */
 991static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 992{
 993#define C(x) n->x = skb->x
 994
 995	n->next = n->prev = NULL;
 996	n->sk = NULL;
 997	__copy_skb_header(n, skb);
 998
 999	C(len);
1000	C(data_len);
1001	C(mac_len);
1002	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
1003	n->cloned = 1;
1004	n->nohdr = 0;
1005	n->peeked = 0;
1006	C(pfmemalloc);
1007	n->destructor = NULL;
1008	C(tail);
1009	C(end);
1010	C(head);
1011	C(head_frag);
1012	C(data);
1013	C(truesize);
1014	refcount_set(&n->users, 1);
1015
1016	atomic_inc(&(skb_shinfo(skb)->dataref));
1017	skb->cloned = 1;
1018
1019	return n;
1020#undef C
1021}
1022
1023/**
1024 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
1025 * @first: first sk_buff of the msg
1026 */
1027struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
1028{
1029	struct sk_buff *n;
1030
1031	n = alloc_skb(0, GFP_ATOMIC);
1032	if (!n)
1033		return NULL;
1034
1035	n->len = first->len;
1036	n->data_len = first->len;
1037	n->truesize = first->truesize;
1038
1039	skb_shinfo(n)->frag_list = first;
1040
1041	__copy_skb_header(n, first);
1042	n->destructor = NULL;
1043
1044	return n;
1045}
1046EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
1047
/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	Works like skb_clone(), except that the caller provides the skb
 *	that becomes the clone. Whatever @dst held before is released
 *	first.
 *
 *	Returns @dst.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	struct sk_buff *morphed;

	skb_release_all(dst);
	morphed = __skb_clone(dst, src);

	return morphed;
}
EXPORT_SYMBOL_GPL(skb_morph);
1064
1065int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
1066{
1067	unsigned long max_pg, num_pg, new_pg, old_pg;
1068	struct user_struct *user;
1069
1070	if (capable(CAP_IPC_LOCK) || !size)
1071		return 0;
1072
1073	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
1074	max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1075	user = mmp->user ? : current_user();
1076
1077	do {
1078		old_pg = atomic_long_read(&user->locked_vm);
1079		new_pg = old_pg + num_pg;
1080		if (new_pg > max_pg)
1081			return -ENOBUFS;
1082	} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
1083		 old_pg);
1084
1085	if (!mmp->user) {
1086		mmp->user = get_uid(user);
1087		mmp->num_pg = num_pg;
1088	} else {
1089		mmp->num_pg += num_pg;
1090	}
1091
1092	return 0;
1093}
1094EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
1095
1096void mm_unaccount_pinned_pages(struct mmpin *mmp)
1097{
1098	if (mmp->user) {
1099		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
1100		free_uid(mmp->user);
1101	}
1102}
1103EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
1104
1105struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
1106{
1107	struct ubuf_info *uarg;
1108	struct sk_buff *skb;
1109
1110	WARN_ON_ONCE(!in_task());
1111
1112	skb = sock_omalloc(sk, 0, GFP_KERNEL);
1113	if (!skb)
1114		return NULL;
1115
1116	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
1117	uarg = (void *)skb->cb;
1118	uarg->mmp.user = NULL;
1119
1120	if (mm_account_pinned_pages(&uarg->mmp, size)) {
1121		kfree_skb(skb);
1122		return NULL;
1123	}
1124
1125	uarg->callback = sock_zerocopy_callback;
1126	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
1127	uarg->len = 1;
1128	uarg->bytelen = size;
1129	uarg->zerocopy = 1;
1130	refcount_set(&uarg->refcnt, 1);
1131	sock_hold(sk);
1132
1133	return uarg;
1134}
1135EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
1136
1137static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
1138{
1139	return container_of((void *)uarg, struct sk_buff, cb);
1140}
1141
1142struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
1143					struct ubuf_info *uarg)
1144{
1145	if (uarg) {
1146		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
1147		u32 bytelen, next;
1148
1149		/* realloc only when socket is locked (TCP, UDP cork),
1150		 * so uarg->len and sk_zckey access is serialized
1151		 */
1152		if (!sock_owned_by_user(sk)) {
1153			WARN_ON_ONCE(1);
1154			return NULL;
1155		}
1156
1157		bytelen = uarg->bytelen + size;
1158		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
1159			/* TCP can create new skb to attach new uarg */
1160			if (sk->sk_type == SOCK_STREAM)
1161				goto new_alloc;
1162			return NULL;
1163		}
1164
1165		next = (u32)atomic_read(&sk->sk_zckey);
1166		if ((u32)(uarg->id + uarg->len) == next) {
1167			if (mm_account_pinned_pages(&uarg->mmp, size))
1168				return NULL;
1169			uarg->len++;
1170			uarg->bytelen = bytelen;
1171			atomic_set(&sk->sk_zckey, ++next);
1172
1173			/* no extra ref when appending to datagram (MSG_MORE) */
1174			if (sk->sk_type == SOCK_STREAM)
1175				sock_zerocopy_get(uarg);
1176
1177			return uarg;
1178		}
1179	}
1180
1181new_alloc:
1182	return sock_zerocopy_alloc(sk, size);
1183}
1184EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
1185
1186static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
1187{
1188	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
1189	u32 old_lo, old_hi;
1190	u64 sum_len;
1191
1192	old_lo = serr->ee.ee_info;
1193	old_hi = serr->ee.ee_data;
1194	sum_len = old_hi - old_lo + 1ULL + len;
1195
1196	if (sum_len >= (1ULL << 32))
1197		return false;
1198
1199	if (lo != old_hi + 1)
1200		return false;
1201
1202	serr->ee.ee_data += len;
1203	return true;
1204}
1205
1206void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
1207{
1208	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
1209	struct sock_exterr_skb *serr;
1210	struct sock *sk = skb->sk;
1211	struct sk_buff_head *q;
1212	unsigned long flags;
1213	u32 lo, hi;
1214	u16 len;
1215
1216	mm_unaccount_pinned_pages(&uarg->mmp);
1217
1218	/* if !len, there was only 1 call, and it was aborted
1219	 * so do not queue a completion notification
1220	 */
1221	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
1222		goto release;
1223
1224	len = uarg->len;
1225	lo = uarg->id;
1226	hi = uarg->id + len - 1;
1227
1228	serr = SKB_EXT_ERR(skb);
1229	memset(serr, 0, sizeof(*serr));
1230	serr->ee.ee_errno = 0;
1231	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
1232	serr->ee.ee_data = hi;
1233	serr->ee.ee_info = lo;
1234	if (!success)
1235		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
1236
1237	q = &sk->sk_error_queue;
1238	spin_lock_irqsave(&q->lock, flags);
1239	tail = skb_peek_tail(q);
1240	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
1241	    !skb_zerocopy_notify_extend(tail, lo, len)) {
1242		__skb_queue_tail(q, skb);
1243		skb = NULL;
1244	}
1245	spin_unlock_irqrestore(&q->lock, flags);
1246
1247	sk->sk_error_report(sk);
1248
1249release:
1250	consume_skb(skb);
1251	sock_put(sk);
1252}
1253EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
1254
1255void sock_zerocopy_put(struct ubuf_info *uarg)
1256{
1257	if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
1258		if (uarg->callback)
1259			uarg->callback(uarg, uarg->zerocopy);
1260		else
1261			consume_skb(skb_from_uarg(uarg));
1262	}
1263}
1264EXPORT_SYMBOL_GPL(sock_zerocopy_put);
1265
1266void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
1267{
1268	if (uarg) {
1269		struct sock *sk = skb_from_uarg(uarg)->sk;
1270
1271		atomic_dec(&sk->sk_zckey);
1272		uarg->len--;
1273
1274		if (have_uref)
1275			sock_zerocopy_put(uarg);
1276	}
1277}
1278EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
1279
1280int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1281{
1282	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1283}
1284EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1285
1286int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1287			     struct msghdr *msg, int len,
1288			     struct ubuf_info *uarg)
1289{
1290	struct ubuf_info *orig_uarg = skb_zcopy(skb);
1291	struct iov_iter orig_iter = msg->msg_iter;
1292	int err, orig_len = skb->len;
1293
1294	/* An skb can only point to one uarg. This edge case happens when
1295	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
1296	 */
1297	if (orig_uarg && uarg != orig_uarg)
1298		return -EEXIST;
1299
1300	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
1301	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
1302		struct sock *save_sk = skb->sk;
1303
1304		/* Streams do not free skb on error. Reset to prev state. */
1305		msg->msg_iter = orig_iter;
1306		skb->sk = sk;
1307		___pskb_trim(skb, orig_len);
1308		skb->sk = save_sk;
1309		return err;
1310	}
1311
1312	skb_zcopy_set(skb, uarg, NULL);
1313	return skb->len - orig_len;
1314}
1315EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
1316
1317static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
1318			      gfp_t gfp_mask)
1319{
1320	if (skb_zcopy(orig)) {
1321		if (skb_zcopy(nskb)) {
1322			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
1323			if (!gfp_mask) {
1324				WARN_ON_ONCE(1);
1325				return -ENOMEM;
1326			}
1327			if (skb_uarg(nskb) == skb_uarg(orig))
1328				return 0;
1329			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1330				return -EIO;
1331		}
1332		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
1333	}
1334	return 0;
1335}
1336
1337/**
1338 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
1339 *	@skb: the skb to modify
1340 *	@gfp_mask: allocation priority
1341 *
1342 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
1343 *	It will copy all frags into kernel and drop the reference
1344 *	to userspace pages.
1345 *
1346 *	If this function is called from an interrupt gfp_mask() must be
1347 *	%GFP_ATOMIC.
1348 *
1349 *	Returns 0 on success or a negative error code on failure
1350 *	to allocate kernel memory to copy to.
1351 */
1352int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
1353{
1354	int num_frags = skb_shinfo(skb)->nr_frags;
1355	struct page *page, *head = NULL;
1356	int i, new_frags;
1357	u32 d_off;
1358
1359	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
1360		return -EINVAL;
1361
1362	if (!num_frags)
1363		goto release;
1364
1365	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1366	for (i = 0; i < new_frags; i++) {
1367		page = alloc_page(gfp_mask);
1368		if (!page) {
1369			while (head) {
1370				struct page *next = (struct page *)page_private(head);
1371				put_page(head);
1372				head = next;
1373			}
1374			return -ENOMEM;
1375		}
1376		set_page_private(page, (unsigned long)head);
1377		head = page;
1378	}
1379
1380	page = head;
1381	d_off = 0;
1382	for (i = 0; i < num_frags; i++) {
1383		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1384		u32 p_off, p_len, copied;
1385		struct page *p;
1386		u8 *vaddr;
1387
1388		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
1389				      p, p_off, p_len, copied) {
1390			u32 copy, done = 0;
1391			vaddr = kmap_atomic(p);
1392
1393			while (done < p_len) {
1394				if (d_off == PAGE_SIZE) {
1395					d_off = 0;
1396					page = (struct page *)page_private(page);
1397				}
1398				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
1399				memcpy(page_address(page) + d_off,
1400				       vaddr + p_off + done, copy);
1401				done += copy;
1402				d_off += copy;
1403			}
1404			kunmap_atomic(vaddr);
1405		}
1406	}
1407
1408	/* skb frags release userspace buffers */
1409	for (i = 0; i < num_frags; i++)
1410		skb_frag_unref(skb, i);
1411
1412	/* skb frags point to kernel buffers */
1413	for (i = 0; i < new_frags - 1; i++) {
1414		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
1415		head = (struct page *)page_private(head);
1416	}
1417	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
1418	skb_shinfo(skb)->nr_frags = new_frags;
1419
1420release:
1421	skb_zcopy_clear(skb, false);
1422	return 0;
1423}
1424EXPORT_SYMBOL_GPL(skb_copy_ubufs);
1425
1426/**
1427 *	skb_clone	-	duplicate an sk_buff
1428 *	@skb: buffer to clone
1429 *	@gfp_mask: allocation priority
1430 *
1431 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
1432 *	copies share the same packet data but not structure. The new
1433 *	buffer has a reference count of 1. If the allocation fails the
1434 *	function returns %NULL otherwise the new buffer is returned.
1435 *
1436 *	If this function is called from an interrupt gfp_mask() must be
1437 *	%GFP_ATOMIC.
1438 */
1439
1440struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1441{
1442	struct sk_buff_fclones *fclones = container_of(skb,
1443						       struct sk_buff_fclones,
1444						       skb1);
1445	struct sk_buff *n;
1446
1447	if (skb_orphan_frags(skb, gfp_mask))
1448		return NULL;
1449
1450	if (skb->fclone == SKB_FCLONE_ORIG &&
1451	    refcount_read(&fclones->fclone_ref) == 1) {
1452		n = &fclones->skb2;
1453		refcount_set(&fclones->fclone_ref, 2);
1454	} else {
1455		if (skb_pfmemalloc(skb))
1456			gfp_mask |= __GFP_MEMALLOC;
1457
1458		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
1459		if (!n)
1460			return NULL;
1461
1462		n->fclone = SKB_FCLONE_UNAVAILABLE;
1463	}
1464
1465	return __skb_clone(n, skb);
1466}
1467EXPORT_SYMBOL(skb_clone);
1468
1469void skb_headers_offset_update(struct sk_buff *skb, int off)
1470{
1471	/* Only adjust this if it actually is csum_start rather than csum */
1472	if (skb->ip_summed == CHECKSUM_PARTIAL)
1473		skb->csum_start += off;
1474	/* {transport,network,mac}_header and tail are relative to skb->head */
1475	skb->transport_header += off;
1476	skb->network_header   += off;
1477	if (skb_mac_header_was_set(skb))
1478		skb->mac_header += off;
1479	skb->inner_transport_header += off;
1480	skb->inner_network_header += off;
1481	skb->inner_mac_header += off;
1482}
1483EXPORT_SYMBOL(skb_headers_offset_update);
1484
1485void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
1486{
1487	__copy_skb_header(new, old);
1488
1489	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
1490	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
1491	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
1492}
1493EXPORT_SYMBOL(skb_copy_header);
1494
1495static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
1496{
1497	if (skb_pfmemalloc(skb))
1498		return SKB_ALLOC_RX;
1499	return 0;
1500}
1501
1502/**
1503 *	skb_copy	-	create private copy of an sk_buff
1504 *	@skb: buffer to copy
1505 *	@gfp_mask: allocation priority
1506 *
1507 *	Make a copy of both an &sk_buff and its data. This is used when the
1508 *	caller wishes to modify the data and needs a private copy of the
1509 *	data to alter. Returns %NULL on failure or the pointer to the buffer
1510 *	on success. The returned buffer has a reference count of 1.
1511 *
1512 *	As by-product this function converts non-linear &sk_buff to linear
1513 *	one, so that &sk_buff becomes completely private and caller is allowed
1514 *	to modify all the data of returned buffer. This means that this
1515 *	function is not recommended for use in circumstances when only
1516 *	header is going to be modified. Use pskb_copy() instead.
1517 */
1518
1519struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
1520{
1521	int headerlen = skb_headroom(skb);
1522	unsigned int size = skb_end_offset(skb) + skb->data_len;
1523	struct sk_buff *n = __alloc_skb(size, gfp_mask,
1524					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
1525
1526	if (!n)
1527		return NULL;
1528
1529	/* Set the data pointer */
1530	skb_reserve(n, headerlen);
1531	/* Set the tail pointer and length */
1532	skb_put(n, skb->len);
1533
1534	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
1535
1536	skb_copy_header(n, skb);
1537	return n;
1538}
1539EXPORT_SYMBOL(skb_copy);
1540
1541/**
1542 *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.
1543 *	@skb: buffer to copy
1544 *	@headroom: headroom of new skb
1545 *	@gfp_mask: allocation priority
1546 *	@fclone: if true allocate the copy of the skb from the fclone
1547 *	cache instead of the head cache; it is recommended to set this
1548 *	to true for the cases where the copy will likely be cloned
1549 *
1550 *	Make a copy of both an &sk_buff and part of its data, located
1551 *	in header. Fragmented data remain shared. This is used when
1552 *	the caller wishes to modify only header of &sk_buff and needs
1553 *	private copy of the header to alter. Returns %NULL on failure
1554 *	or the pointer to the buffer on success.
1555 *	The returned buffer has a reference count of 1.
1556 */
1557
1558struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
1559				   gfp_t gfp_mask, bool fclone)
1560{
1561	unsigned int size = skb_headlen(skb) + headroom;
1562	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
1563	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
1564
1565	if (!n)
1566		goto out;
1567
1568	/* Set the data pointer */
1569	skb_reserve(n, headroom);
1570	/* Set the tail pointer and length */
1571	skb_put(n, skb_headlen(skb));
1572	/* Copy the bytes */
1573	skb_copy_from_linear_data(skb, n->data, n->len);
1574
1575	n->truesize += skb->data_len;
1576	n->data_len  = skb->data_len;
1577	n->len	     = skb->len;
1578
1579	if (skb_shinfo(skb)->nr_frags) {
1580		int i;
1581
1582		if (skb_orphan_frags(skb, gfp_mask) ||
1583		    skb_zerocopy_clone(n, skb, gfp_mask)) {
1584			kfree_skb(n);
1585			n = NULL;
1586			goto out;
1587		}
1588		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1589			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
1590			skb_frag_ref(skb, i);
1591		}
1592		skb_shinfo(n)->nr_frags = i;
1593	}
1594
1595	if (skb_has_frag_list(skb)) {
1596		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
1597		skb_clone_fraglist(n);
1598	}
1599
1600	skb_copy_header(n, skb);
1601out:
1602	return n;
1603}
1604EXPORT_SYMBOL(__pskb_copy_fclone);
1605
1606/**
1607 *	pskb_expand_head - reallocate header of &sk_buff
1608 *	@skb: buffer to reallocate
1609 *	@nhead: room to add at head
1610 *	@ntail: room to add at tail
1611 *	@gfp_mask: allocation priority
1612 *
1613 *	Expands (or creates identical copy, if @nhead and @ntail are zero)
1614 *	header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
1615 *	reference count of 1. Returns zero in the case of success or error,
1616 *	if expansion failed. In the last case, &sk_buff is not changed.
1617 *
1618 *	All the pointers pointing into skb header may change and must be
1619 *	reloaded after call to this function.
1620 */
1621
1622int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1623		     gfp_t gfp_mask)
1624{
1625	int i, osize = skb_end_offset(skb);
1626	int size = osize + nhead + ntail;
1627	long off;
1628	u8 *data;
1629
1630	BUG_ON(nhead < 0);
1631
1632	BUG_ON(skb_shared(skb));
1633
1634	size = SKB_DATA_ALIGN(size);
1635
1636	if (skb_pfmemalloc(skb))
1637		gfp_mask |= __GFP_MEMALLOC;
1638	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1639			       gfp_mask, NUMA_NO_NODE, NULL);
1640	if (!data)
1641		goto nodata;
1642	size = SKB_WITH_OVERHEAD(ksize(data));
1643
1644	/* Copy only real data... and, alas, header. This should be
1645	 * optimized for the cases when header is void.
1646	 */
1647	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
1648
1649	memcpy((struct skb_shared_info *)(data + size),
1650	       skb_shinfo(skb),
1651	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
1652
1653	/*
1654	 * if shinfo is shared we must drop the old head gracefully, but if it
1655	 * is not we can just drop the old head and let the existing refcount
1656	 * be since all we did is relocate the values
1657	 */
1658	if (skb_cloned(skb)) {
1659		if (skb_orphan_frags(skb, gfp_mask))
1660			goto nofrags;
1661		if (skb_zcopy(skb))
1662			refcount_inc(&skb_uarg(skb)->refcnt);
1663		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1664			skb_frag_ref(skb, i);
1665
1666		if (skb_has_frag_list(skb))
1667			skb_clone_fraglist(skb);
1668
1669		skb_release_data(skb);
1670	} else {
1671		skb_free_head(skb);
1672	}
1673	off = (data + nhead) - skb->head;
1674
1675	skb->head     = data;
1676	skb->head_frag = 0;
1677	skb->data    += off;
1678#ifdef NET_SKBUFF_DATA_USES_OFFSET
1679	skb->end      = size;
1680	off           = nhead;
1681#else
1682	skb->end      = skb->head + size;
1683#endif
1684	skb->tail	      += off;
1685	skb_headers_offset_update(skb, nhead);
1686	skb->cloned   = 0;
1687	skb->hdr_len  = 0;
1688	skb->nohdr    = 0;
1689	atomic_set(&skb_shinfo(skb)->dataref, 1);
1690
1691	skb_metadata_clear(skb);
1692
1693	/* It is not generally safe to change skb->truesize.
1694	 * For the moment, we really care of rx path, or
1695	 * when skb is orphaned (not attached to a socket).
1696	 */
1697	if (!skb->sk || skb->destructor == sock_edemux)
1698		skb->truesize += size - osize;
1699
1700	return 0;
1701
1702nofrags:
1703	kfree(data);
1704nodata:
1705	return -ENOMEM;
1706}
1707EXPORT_SYMBOL(pskb_expand_head);
1708
1709/* Make private copy of skb with writable head and some headroom */
1710
1711struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
1712{
1713	struct sk_buff *skb2;
1714	int delta = headroom - skb_headroom(skb);
1715
1716	if (delta <= 0)
1717		skb2 = pskb_copy(skb, GFP_ATOMIC);
1718	else {
1719		skb2 = skb_clone(skb, GFP_ATOMIC);
1720		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
1721					     GFP_ATOMIC)) {
1722			kfree_skb(skb2);
1723			skb2 = NULL;
1724		}
1725	}
1726	return skb2;
1727}
1728EXPORT_SYMBOL(skb_realloc_headroom);
1729
1730/**
1731 *	skb_copy_expand	-	copy and expand sk_buff
1732 *	@skb: buffer to copy
1733 *	@newheadroom: new free bytes at head
1734 *	@newtailroom: new free bytes at tail
1735 *	@gfp_mask: allocation priority
1736 *
1737 *	Make a copy of both an &sk_buff and its data and while doing so
1738 *	allocate additional space.
1739 *
1740 *	This is used when the caller wishes to modify the data and needs a
1741 *	private copy of the data to alter as well as more space for new fields.
1742 *	Returns %NULL on failure or the pointer to the buffer
1743 *	on success. The returned buffer has a reference count of 1.
1744 *
1745 *	You must pass %GFP_ATOMIC as the allocation priority if this function
1746 *	is called from an interrupt.
1747 */
1748struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1749				int newheadroom, int newtailroom,
1750				gfp_t gfp_mask)
1751{
1752	/*
1753	 *	Allocate the copy buffer
1754	 */
1755	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
1756					gfp_mask, skb_alloc_rx_flag(skb),
1757					NUMA_NO_NODE);
1758	int oldheadroom = skb_headroom(skb);
1759	int head_copy_len, head_copy_off;
1760
1761	if (!n)
1762		return NULL;
1763
1764	skb_reserve(n, newheadroom);
1765
1766	/* Set the tail pointer and length */
1767	skb_put(n, skb->len);
1768
1769	head_copy_len = oldheadroom;
1770	head_copy_off = 0;
1771	if (newheadroom <= head_copy_len)
1772		head_copy_len = newheadroom;
1773	else
1774		head_copy_off = newheadroom - head_copy_len;
1775
1776	/* Copy the linear header and data. */
1777	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1778			     skb->len + head_copy_len));
1779
1780	skb_copy_header(n, skb);
1781
1782	skb_headers_offset_update(n, newheadroom - oldheadroom);
1783
1784	return n;
1785}
1786EXPORT_SYMBOL(skb_copy_expand);
1787
1788/**
1789 *	__skb_pad		-	zero pad the tail of an skb
1790 *	@skb: buffer to pad
1791 *	@pad: space to pad
1792 *	@free_on_error: free buffer on error
1793 *
1794 *	Ensure that a buffer is followed by a padding area that is zero
1795 *	filled. Used by network drivers which may DMA or transfer data
1796 *	beyond the buffer end onto the wire.
1797 *
1798 *	May return error in out of memory cases. The skb is freed on error
1799 *	if @free_on_error is true.
1800 */
1801
1802int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
1803{
1804	int err;
1805	int ntail;
1806
1807	/* If the skbuff is non linear tailroom is always zero.. */
1808	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
1809		memset(skb->data+skb->len, 0, pad);
1810		return 0;
1811	}
1812
1813	ntail = skb->data_len + pad - (skb->end - skb->tail);
1814	if (likely(skb_cloned(skb) || ntail > 0)) {
1815		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1816		if (unlikely(err))
1817			goto free_skb;
1818	}
1819
1820	/* FIXME: The use of this function with non-linear skb's really needs
1821	 * to be audited.
1822	 */
1823	err = skb_linearize(skb);
1824	if (unlikely(err))
1825		goto free_skb;
1826
1827	memset(skb->data + skb->len, 0, pad);
1828	return 0;
1829
1830free_skb:
1831	if (free_on_error)
1832		kfree_skb(skb);
1833	return err;
1834}
1835EXPORT_SYMBOL(__skb_pad);
1836
1837/**
1838 *	pskb_put - add data to the tail of a potentially fragmented buffer
1839 *	@skb: start of the buffer to use
1840 *	@tail: tail fragment of the buffer to use
1841 *	@len: amount of data to add
1842 *
1843 *	This function extends the used data area of the potentially
1844 *	fragmented buffer. @tail must be the last fragment of @skb -- or
1845 *	@skb itself. If this would exceed the total buffer size the kernel
1846 *	will panic. A pointer to the first byte of the extra data is
1847 *	returned.
1848 */
1849
1850void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
1851{
1852	if (tail != skb) {
1853		skb->data_len += len;
1854		skb->len += len;
1855	}
1856	return skb_put(tail, len);
1857}
1858EXPORT_SYMBOL_GPL(pskb_put);
1859
1860/**
1861 *	skb_put - add data to a buffer
1862 *	@skb: buffer to use
1863 *	@len: amount of data to add
1864 *
1865 *	This function extends the used data area of the buffer. If this would
1866 *	exceed the total buffer size the kernel will panic. A pointer to the
1867 *	first byte of the extra data is returned.
1868 */
1869void *skb_put(struct sk_buff *skb, unsigned int len)
1870{
1871	void *tmp = skb_tail_pointer(skb);
1872	SKB_LINEAR_ASSERT(skb);
1873	skb->tail += len;
1874	skb->len  += len;
1875	if (unlikely(skb->tail > skb->end))
1876		skb_over_panic(skb, len, __builtin_return_address(0));
1877	return tmp;
1878}
1879EXPORT_SYMBOL(skb_put);
1880
1881/**
1882 *	skb_push - add data to the start of a buffer
1883 *	@skb: buffer to use
1884 *	@len: amount of data to add
1885 *
1886 *	This function extends the used data area of the buffer at the buffer
1887 *	start. If this would exceed the total buffer headroom the kernel will
1888 *	panic. A pointer to the first byte of the extra data is returned.
1889 */
1890void *skb_push(struct sk_buff *skb, unsigned int len)
1891{
1892	skb->data -= len;
1893	skb->len  += len;
1894	if (unlikely(skb->data < skb->head))
1895		skb_under_panic(skb, len, __builtin_return_address(0));
1896	return skb->data;
1897}
1898EXPORT_SYMBOL(skb_push);
1899
/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	Removes data from the start of a buffer and hands that memory
 *	back to the headroom. Returns a pointer to the next data in the
 *	buffer. Later pushes will overwrite the pulled data.
 *
 *	Out-of-line wrapper around skb_pull_inline().
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
	void *next_data = skb_pull_inline(skb, len);

	return next_data;
}
EXPORT_SYMBOL(skb_pull);
1915
1916/**
1917 *	skb_trim - remove end from a buffer
1918 *	@skb: buffer to alter
1919 *	@len: new length
1920 *
1921 *	Cut the length of a buffer down by removing data from the tail. If
1922 *	the buffer is already under the length specified it is not modified.
1923 *	The skb must be linear.
1924 */
1925void skb_trim(struct sk_buff *skb, unsigned int len)
1926{
1927	if (skb->len > len)
1928		__skb_trim(skb, len);
1929}
1930EXPORT_SYMBOL(skb_trim);
1931
1932/* Trims skb to length len. It can change skb pointers.
1933 */
1934
1935int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1936{
1937	struct sk_buff **fragp;
1938	struct sk_buff *frag;
1939	int offset = skb_headlen(skb);
1940	int nfrags = skb_shinfo(skb)->nr_frags;
1941	int i;
1942	int err;
1943
1944	if (skb_cloned(skb) &&
1945	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1946		return err;
1947
1948	i = 0;
1949	if (offset >= len)
1950		goto drop_pages;
1951
1952	for (; i < nfrags; i++) {
1953		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1954
1955		if (end < len) {
1956			offset = end;
1957			continue;
1958		}
1959
1960		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
1961
1962drop_pages:
1963		skb_shinfo(skb)->nr_frags = i;
1964
1965		for (; i < nfrags; i++)
1966			skb_frag_unref(skb, i);
1967
1968		if (skb_has_frag_list(skb))
1969			skb_drop_fraglist(skb);
1970		goto done;
1971	}
1972
1973	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1974	     fragp = &frag->next) {
1975		int end = offset + frag->len;
1976
1977		if (skb_shared(frag)) {
1978			struct sk_buff *nfrag;
1979
1980			nfrag = skb_clone(frag, GFP_ATOMIC);
1981			if (unlikely(!nfrag))
1982				return -ENOMEM;
1983
1984			nfrag->next = frag->next;
1985			consume_skb(frag);
1986			frag = nfrag;
1987			*fragp = frag;
1988		}
1989
1990		if (end < len) {
1991			offset = end;
1992			continue;
1993		}
1994
1995		if (end > len &&
1996		    unlikely((err = pskb_trim(frag, len - offset))))
1997			return err;
1998
1999		if (frag->next)
2000			skb_drop_list(&frag->next);
2001		break;
2002	}
2003
2004done:
2005	if (len > skb_headlen(skb)) {
2006		skb->data_len -= skb->len - len;
2007		skb->len       = len;
2008	} else {
2009		skb->len       = len;
2010		skb->data_len  = 0;
2011		skb_set_tail_pointer(skb, len);
2012	}
2013
2014	if (!skb->sk || skb->destructor == sock_edemux)
2015		skb_condense(skb);
2016	return 0;
2017}
2018EXPORT_SYMBOL(___pskb_trim);
2019
2020/* Note : use pskb_trim_rcsum() instead of calling this directly
2021 */
2022int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
2023{
2024	if (skb->ip_summed == CHECKSUM_COMPLETE) {
2025		int delta = skb->len - len;
2026
2027		skb->csum = csum_block_sub(skb->csum,
2028					   skb_checksum(skb, len, delta, 0),
2029					   len);
2030	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2031		int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
2032		int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
2033
2034		if (offset + sizeof(__sum16) > hdlen)
2035			return -EINVAL;
2036	}
2037	return __pskb_trim(skb, len);
2038}
2039EXPORT_SYMBOL(pskb_trim_rcsum_slow);
2040
2041/**
2042 *	__pskb_pull_tail - advance tail of skb header
2043 *	@skb: buffer to reallocate
2044 *	@delta: number of bytes to advance tail
2045 *
2046 *	The function makes a sense only on a fragmented &sk_buff,
2047 *	it expands header moving its tail forward and copying necessary
2048 *	data from fragmented part.
2049 *
2050 *	&sk_buff MUST have reference count of 1.
2051 *
2052 *	Returns %NULL (and &sk_buff does not change) if pull failed
2053 *	or value of new tail of skb in the case of success.
2054 *
2055 *	All the pointers pointing into skb header may change and must be
2056 *	reloaded after call to this function.
2057 */
2058
2059/* Moves tail of skb head forward, copying data from fragmented part,
2060 * when it is necessary.
2061 * 1. It may fail due to malloc failure.
2062 * 2. It may change skb pointers.
2063 *
2064 * It is pretty complicated. Luckily, it is called only in exceptional cases.
2065 */
2066void *__pskb_pull_tail(struct sk_buff *skb, int delta)
2067{
2068	/* If skb has not enough free space at tail, get new one
2069	 * plus 128 bytes for future expansions. If we have enough
2070	 * room at tail, reallocate without expansion only if skb is cloned.
2071	 */
2072	int i, k, eat = (skb->tail + delta) - skb->end;
2073
2074	if (eat > 0 || skb_cloned(skb)) {
2075		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
2076				     GFP_ATOMIC))
2077			return NULL;
2078	}
2079
2080	BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
2081			     skb_tail_pointer(skb), delta));
2082
2083	/* Optimization: no fragments, no reasons to preestimate
2084	 * size of pulled pages. Superb.
2085	 */
2086	if (!skb_has_frag_list(skb))
2087		goto pull_pages;
2088
2089	/* Estimate size of pulled pages. */
2090	eat = delta;
2091	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2092		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2093
2094		if (size >= eat)
2095			goto pull_pages;
2096		eat -= size;
2097	}
2098
2099	/* If we need update frag list, we are in troubles.
2100	 * Certainly, it is possible to add an offset to skb data,
2101	 * but taking into account that pulling is expected to
2102	 * be very rare operation, it is worth to fight against
2103	 * further bloating skb head and crucify ourselves here instead.
2104	 * Pure masohism, indeed. 8)8)
2105	 */
2106	if (eat) {
2107		struct sk_buff *list = skb_shinfo(skb)->frag_list;
2108		struct sk_buff *clone = NULL;
2109		struct sk_buff *insp = NULL;
2110
2111		do {
2112			if (list->len <= eat) {
2113				/* Eaten as whole. */
2114				eat -= list->len;
2115				list = list->next;
2116				insp = list;
2117			} else {
2118				/* Eaten partially. */
2119
2120				if (skb_shared(list)) {
2121					/* Sucks! We need to fork list. :-( */
2122					clone = skb_clone(list, GFP_ATOMIC);
2123					if (!clone)
2124						return NULL;
2125					insp = list->next;
2126					list = clone;
2127				} else {
2128					/* This may be pulled without
2129					 * problems. */
2130					insp = list;
2131				}
2132				if (!pskb_pull(list, eat)) {
2133					kfree_skb(clone);
2134					return NULL;
2135				}
2136				break;
2137			}
2138		} while (eat);
2139
2140		/* Free pulled out fragments. */
2141		while ((list = skb_shinfo(skb)->frag_list) != insp) {
2142			skb_shinfo(skb)->frag_list = list->next;
2143			kfree_skb(list);
2144		}
2145		/* And insert new clone at head. */
2146		if (clone) {
2147			clone->next = list;
2148			skb_shinfo(skb)->frag_list = clone;
2149		}
2150	}
2151	/* Success! Now we may commit changes to skb data. */
2152
2153pull_pages:
2154	eat = delta;
2155	k = 0;
2156	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2157		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2158
2159		if (size <= eat) {
2160			skb_frag_unref(skb, i);
2161			eat -= size;
2162		} else {
2163			skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
2164
2165			*frag = skb_shinfo(skb)->frags[i];
2166			if (eat) {
2167				skb_frag_off_add(frag, eat);
2168				skb_frag_size_sub(frag, eat);
2169				if (!i)
2170					goto end;
2171				eat = 0;
2172			}
2173			k++;
2174		}
2175	}
2176	skb_shinfo(skb)->nr_frags = k;
2177
2178end:
2179	skb->tail     += delta;
2180	skb->data_len -= delta;
2181
2182	if (!skb->data_len)
2183		skb_zcopy_clear(skb, false);
2184
2185	return skb_tail_pointer(skb);
2186}
2187EXPORT_SYMBOL(__pskb_pull_tail);
2188
2189/**
2190 *	skb_copy_bits - copy bits from skb to kernel buffer
2191 *	@skb: source skb
2192 *	@offset: offset in source
2193 *	@to: destination buffer
2194 *	@len: number of bytes to copy
2195 *
2196 *	Copy the specified number of bytes from the source skb to the
2197 *	destination buffer.
2198 *
2199 *	CAUTION ! :
2200 *		If its prototype is ever changed,
2201 *		check arch/{*}/net/{*}.S files,
2202 *		since it is called from BPF assembly code.
2203 */
2204int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
2205{
2206	int start = skb_headlen(skb);
2207	struct sk_buff *frag_iter;
2208	int i, copy;
2209
2210	if (offset > (int)skb->len - len)
2211		goto fault;
2212
2213	/* Copy header. */
2214	if ((copy = start - offset) > 0) {
2215		if (copy > len)
2216			copy = len;
2217		skb_copy_from_linear_data_offset(skb, offset, to, copy);
2218		if ((len -= copy) == 0)
2219			return 0;
2220		offset += copy;
2221		to     += copy;
2222	}
2223
2224	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2225		int end;
2226		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
2227
2228		WARN_ON(start > offset + len);
2229
2230		end = start + skb_frag_size(f);
2231		if ((copy = end - offset) > 0) {
2232			u32 p_off, p_len, copied;
2233			struct page *p;
2234			u8 *vaddr;
2235
2236			if (copy > len)
2237				copy = len;
2238
2239			skb_frag_foreach_page(f,
2240					      skb_frag_off(f) + offset - start,
2241					      copy, p, p_off, p_len, copied) {
2242				vaddr = kmap_atomic(p);
2243				memcpy(to + copied, vaddr + p_off, p_len);
2244				kunmap_atomic(vaddr);
2245			}
2246
2247			if ((len -= copy) == 0)
2248				return 0;
2249			offset += copy;
2250			to     += copy;
2251		}
2252		start = end;
2253	}
2254
2255	skb_walk_frags(skb, frag_iter) {
2256		int end;
2257
2258		WARN_ON(start > offset + len);
2259
2260		end = start + frag_iter->len;
2261		if ((copy = end - offset) > 0) {
2262			if (copy > len)
2263				copy = len;
2264			if (skb_copy_bits(frag_iter, offset - start, to, copy))
2265				goto fault;
2266			if ((len -= copy) == 0)
2267				return 0;
2268			offset += copy;
2269			to     += copy;
2270		}
2271		start = end;
2272	}
2273
2274	if (!len)
2275		return 0;
2276
2277fault:
2278	return -EFAULT;
2279}
2280EXPORT_SYMBOL(skb_copy_bits);
2281
2282/*
2283 * Callback from splice_to_pipe(), if we need to release some pages
2284 * at the end of the spd in case we error'ed out in filling the pipe.
2285 */
2286static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
2287{
2288	put_page(spd->pages[i]);
2289}
2290
2291static struct page *linear_to_page(struct page *page, unsigned int *len,
2292				   unsigned int *offset,
2293				   struct sock *sk)
2294{
2295	struct page_frag *pfrag = sk_page_frag(sk);
2296
2297	if (!sk_page_frag_refill(sk, pfrag))
2298		return NULL;
2299
2300	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
2301
2302	memcpy(page_address(pfrag->page) + pfrag->offset,
2303	       page_address(page) + *offset, *len);
2304	*offset = pfrag->offset;
2305	pfrag->offset += *len;
2306
2307	return pfrag->page;
2308}
2309
2310static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
2311			     struct page *page,
2312			     unsigned int offset)
2313{
2314	return	spd->nr_pages &&
2315		spd->pages[spd->nr_pages - 1] == page &&
2316		(spd->partial[spd->nr_pages - 1].offset +
2317		 spd->partial[spd->nr_pages - 1].len == offset);
2318}
2319
2320/*
2321 * Fill page/offset/length into spd, if it can hold more pages.
2322 */
2323static bool spd_fill_page(struct splice_pipe_desc *spd,
2324			  struct pipe_inode_info *pipe, struct page *page,
2325			  unsigned int *len, unsigned int offset,
2326			  bool linear,
2327			  struct sock *sk)
2328{
2329	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
2330		return true;
2331
2332	if (linear) {
2333		page = linear_to_page(page, len, &offset, sk);
2334		if (!page)
2335			return true;
2336	}
2337	if (spd_can_coalesce(spd, page, offset)) {
2338		spd->partial[spd->nr_pages - 1].len += *len;
2339		return false;
2340	}
2341	get_page(page);
2342	spd->pages[spd->nr_pages] = page;
2343	spd->partial[spd->nr_pages].len = *len;
2344	spd->partial[spd->nr_pages].offset = offset;
2345	spd->nr_pages++;
2346
2347	return false;
2348}
2349
2350static bool __splice_segment(struct page *page, unsigned int poff,
2351			     unsigned int plen, unsigned int *off,
2352			     unsigned int *len,
2353			     struct splice_pipe_desc *spd, bool linear,
2354			     struct sock *sk,
2355			     struct pipe_inode_info *pipe)
2356{
2357	if (!*len)
2358		return true;
2359
2360	/* skip this segment if already processed */
2361	if (*off >= plen) {
2362		*off -= plen;
2363		return false;
2364	}
2365
2366	/* ignore any bits we already processed */
2367	poff += *off;
2368	plen -= *off;
2369	*off = 0;
2370
2371	do {
2372		unsigned int flen = min(*len, plen);
2373
2374		if (spd_fill_page(spd, pipe, page, &flen, poff,
2375				  linear, sk))
2376			return true;
2377		poff += flen;
2378		plen -= flen;
2379		*len -= flen;
2380	} while (*len && plen);
2381
2382	return false;
2383}
2384
2385/*
2386 * Map linear and fragment data from the skb to spd. It reports true if the
2387 * pipe is full or if we already spliced the requested length.
2388 */
2389static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
2390			      unsigned int *offset, unsigned int *len,
2391			      struct splice_pipe_desc *spd, struct sock *sk)
2392{
2393	int seg;
2394	struct sk_buff *iter;
2395
2396	/* map the linear part :
2397	 * If skb->head_frag is set, this 'linear' part is backed by a
2398	 * fragment, and if the head is not shared with any clones then
2399	 * we can avoid a copy since we own the head portion of this page.
2400	 */
2401	if (__splice_segment(virt_to_page(skb->data),
2402			     (unsigned long) skb->data & (PAGE_SIZE - 1),
2403			     skb_headlen(skb),
2404			     offset, len, spd,
2405			     skb_head_is_locked(skb),
2406			     sk, pipe))
2407		return true;
2408
2409	/*
2410	 * then map the fragments
2411	 */
2412	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
2413		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
2414
2415		if (__splice_segment(skb_frag_page(f),
2416				     skb_frag_off(f), skb_frag_size(f),
2417				     offset, len, spd, false, sk, pipe))
2418			return true;
2419	}
2420
2421	skb_walk_frags(skb, iter) {
2422		if (*offset >= iter->len) {
2423			*offset -= iter->len;
2424			continue;
2425		}
2426		/* __skb_splice_bits() only fails if the output has no room
2427		 * left, so no point in going over the frag_list for the error
2428		 * case.
2429		 */
2430		if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
2431			return true;
2432	}
2433
2434	return false;
2435}
2436
2437/*
2438 * Map data from the skb to a pipe. Should handle both the linear part,
2439 * the fragments, and the frag list.
2440 */
2441int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
2442		    struct pipe_inode_info *pipe, unsigned int tlen,
2443		    unsigned int flags)
2444{
2445	struct partial_page partial[MAX_SKB_FRAGS];
2446	struct page *pages[MAX_SKB_FRAGS];
2447	struct splice_pipe_desc spd = {
2448		.pages = pages,
2449		.partial = partial,
2450		.nr_pages_max = MAX_SKB_FRAGS,
2451		.ops = &nosteal_pipe_buf_ops,
2452		.spd_release = sock_spd_release,
2453	};
2454	int ret = 0;
2455
2456	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
2457
2458	if (spd.nr_pages)
2459		ret = splice_to_pipe(pipe, &spd);
2460
2461	return ret;
2462}
2463EXPORT_SYMBOL_GPL(skb_splice_bits);
2464
2465/* Send skb data on a socket. Socket must be locked. */
2466int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
2467			 int len)
2468{
2469	unsigned int orig_len = len;
2470	struct sk_buff *head = skb;
2471	unsigned short fragidx;
2472	int slen, ret;
2473
2474do_frag_list:
2475
2476	/* Deal with head data */
2477	while (offset < skb_headlen(skb) && len) {
2478		struct kvec kv;
2479		struct msghdr msg;
2480
2481		slen = min_t(int, len, skb_headlen(skb) - offset);
2482		kv.iov_base = skb->data + offset;
2483		kv.iov_len = slen;
2484		memset(&msg, 0, sizeof(msg));
2485		msg.msg_flags = MSG_DONTWAIT;
2486
2487		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
2488		if (ret <= 0)
2489			goto error;
2490
2491		offset += ret;
2492		len -= ret;
2493	}
2494
2495	/* All the data was skb head? */
2496	if (!len)
2497		goto out;
2498
2499	/* Make offset relative to start of frags */
2500	offset -= skb_headlen(skb);
2501
2502	/* Find where we are in frag list */
2503	for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2504		skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
2505
2506		if (offset < skb_frag_size(frag))
2507			break;
2508
2509		offset -= skb_frag_size(frag);
2510	}
2511
2512	for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2513		skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
2514
2515		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
2516
2517		while (slen) {
2518			ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
2519						     skb_frag_off(frag) + offset,
2520						     slen, MSG_DONTWAIT);
2521			if (ret <= 0)
2522				goto error;
2523
2524			len -= ret;
2525			offset += ret;
2526			slen -= ret;
2527		}
2528
2529		offset = 0;
2530	}
2531
2532	if (len) {
2533		/* Process any frag lists */
2534
2535		if (skb == head) {
2536			if (skb_has_frag_list(skb)) {
2537				skb = skb_shinfo(skb)->frag_list;
2538				goto do_frag_list;
2539			}
2540		} else if (skb->next) {
2541			skb = skb->next;
2542			goto do_frag_list;
2543		}
2544	}
2545
2546out:
2547	return orig_len - len;
2548
2549error:
2550	return orig_len == len ? ret : orig_len - len;
2551}
2552EXPORT_SYMBOL_GPL(skb_send_sock_locked);
2553
2554/**
2555 *	skb_store_bits - store bits from kernel buffer to skb
2556 *	@skb: destination buffer
2557 *	@offset: offset in destination
2558 *	@from: source buffer
2559 *	@len: number of bytes to copy
2560 *
2561 *	Copy the specified number of bytes from the source buffer to the
2562 *	destination skb.  This function handles all the messy bits of
2563 *	traversing fragment lists and such.
2564 */
2565
2566int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
2567{
2568	int start = skb_headlen(skb);
2569	struct sk_buff *frag_iter;
2570	int i, copy;
2571
2572	if (offset > (int)skb->len - len)
2573		goto fault;
2574
2575	if ((copy = start - offset) > 0) {
2576		if (copy > len)
2577			copy = len;
2578		skb_copy_to_linear_data_offset(skb, offset, from, copy);
2579		if ((len -= copy) == 0)
2580			return 0;
2581		offset += copy;
2582		from += copy;
2583	}
2584
2585	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2586		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2587		int end;
2588
2589		WARN_ON(start > offset + len);
2590
2591		end = start + skb_frag_size(frag);
2592		if ((copy = end - offset) > 0) {
2593			u32 p_off, p_len, copied;
2594			struct page *p;
2595			u8 *vaddr;
2596
2597			if (copy > len)
2598				copy = len;
2599
2600			skb_frag_foreach_page(frag,
2601					      skb_frag_off(frag) + offset - start,
2602					      copy, p, p_off, p_len, copied) {
2603				vaddr = kmap_atomic(p);
2604				memcpy(vaddr + p_off, from + copied, p_len);
2605				kunmap_atomic(vaddr);
2606			}
2607
2608			if ((len -= copy) == 0)
2609				return 0;
2610			offset += copy;
2611			from += copy;
2612		}
2613		start = end;
2614	}
2615
2616	skb_walk_frags(skb, frag_iter) {
2617		int end;
2618
2619		WARN_ON(start > offset + len);
2620
2621		end = start + frag_iter->len;
2622		if ((copy = end - offset) > 0) {
2623			if (copy > len)
2624				copy = len;
2625			if (skb_store_bits(frag_iter, offset - start,
2626					   from, copy))
2627				goto fault;
2628			if ((len -= copy) == 0)
2629				return 0;
2630			offset += copy;
2631			from += copy;
2632		}
2633		start = end;
2634	}
2635	if (!len)
2636		return 0;
2637
2638fault:
2639	return -EFAULT;
2640}
2641EXPORT_SYMBOL(skb_store_bits);
2642
2643/* Checksum skb data. */
2644__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
2645		      __wsum csum, const struct skb_checksum_ops *ops)
2646{
2647	int start = skb_headlen(skb);
2648	int i, copy = start - offset;
2649	struct sk_buff *frag_iter;
2650	int pos = 0;
2651
2652	/* Checksum header. */
2653	if (copy > 0) {
2654		if (copy > len)
2655			copy = len;
2656		csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
2657				       skb->data + offset, copy, csum);
2658		if ((len -= copy) == 0)
2659			return csum;
2660		offset += copy;
2661		pos	= copy;
2662	}
2663
2664	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2665		int end;
2666		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2667
2668		WARN_ON(start > offset + len);
2669
2670		end = start + skb_frag_size(frag);
2671		if ((copy = end - offset) > 0) {
2672			u32 p_off, p_len, copied;
2673			struct page *p;
2674			__wsum csum2;
2675			u8 *vaddr;
2676
2677			if (copy > len)
2678				copy = len;
2679
2680			skb_frag_foreach_page(frag,
2681					      skb_frag_off(frag) + offset - start,
2682					      copy, p, p_off, p_len, copied) {
2683				vaddr = kmap_atomic(p);
2684				csum2 = INDIRECT_CALL_1(ops->update,
2685							csum_partial_ext,
2686							vaddr + p_off, p_len, 0);
2687				kunmap_atomic(vaddr);
2688				csum = INDIRECT_CALL_1(ops->combine,
2689						       csum_block_add_ext, csum,
2690						       csum2, pos, p_len);
2691				pos += p_len;
2692			}
2693
2694			if (!(len -= copy))
2695				return csum;
2696			offset += copy;
2697		}
2698		start = end;
2699	}
2700
2701	skb_walk_frags(skb, frag_iter) {
2702		int end;
2703
2704		WARN_ON(start > offset + len);
2705
2706		end = start + frag_iter->len;
2707		if ((copy = end - offset) > 0) {
2708			__wsum csum2;
2709			if (copy > len)
2710				copy = len;
2711			csum2 = __skb_checksum(frag_iter, offset - start,
2712					       copy, 0, ops);
2713			csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
2714					       csum, csum2, pos, copy);
2715			if ((len -= copy) == 0)
2716				return csum;
2717			offset += copy;
2718			pos    += copy;
2719		}
2720		start = end;
2721	}
2722	BUG_ON(len);
2723
2724	return csum;
2725}
2726EXPORT_SYMBOL(__skb_checksum);
2727
2728__wsum skb_checksum(const struct sk_buff *skb, int offset,
2729		    int len, __wsum csum)
2730{
2731	const struct skb_checksum_ops ops = {
2732		.update  = csum_partial_ext,
2733		.combine = csum_block_add_ext,
2734	};
2735
2736	return __skb_checksum(skb, offset, len, csum, &ops);
2737}
2738EXPORT_SYMBOL(skb_checksum);
2739
2740/* Both of above in one bottle. */
2741
2742__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
2743				    u8 *to, int len)
2744{
2745	int start = skb_headlen(skb);
2746	int i, copy = start - offset;
2747	struct sk_buff *frag_iter;
2748	int pos = 0;
2749	__wsum csum = 0;
2750
2751	/* Copy header. */
2752	if (copy > 0) {
2753		if (copy > len)
2754			copy = len;
2755		csum = csum_partial_copy_nocheck(skb->data + offset, to,
2756						 copy);
2757		if ((len -= copy) == 0)
2758			return csum;
2759		offset += copy;
2760		to     += copy;
2761		pos	= copy;
2762	}
2763
2764	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2765		int end;
2766
2767		WARN_ON(start > offset + len);
2768
2769		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2770		if ((copy = end - offset) > 0) {
2771			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2772			u32 p_off, p_len, copied;
2773			struct page *p;
2774			__wsum csum2;
2775			u8 *vaddr;
2776
2777			if (copy > len)
2778				copy = len;
2779
2780			skb_frag_foreach_page(frag,
2781					      skb_frag_off(frag) + offset - start,
2782					      copy, p, p_off, p_len, copied) {
2783				vaddr = kmap_atomic(p);
2784				csum2 = csum_partial_copy_nocheck(vaddr + p_off,
2785								  to + copied,
2786								  p_len);
2787				kunmap_atomic(vaddr);
2788				csum = csum_block_add(csum, csum2, pos);
2789				pos += p_len;
2790			}
2791
2792			if (!(len -= copy))
2793				return csum;
2794			offset += copy;
2795			to     += copy;
2796		}
2797		start = end;
2798	}
2799
2800	skb_walk_frags(skb, frag_iter) {
2801		__wsum csum2;
2802		int end;
2803
2804		WARN_ON(start > offset + len);
2805
2806		end = start + frag_iter->len;
2807		if ((copy = end - offset) > 0) {
2808			if (copy > len)
2809				copy = len;
2810			csum2 = skb_copy_and_csum_bits(frag_iter,
2811						       offset - start,
2812						       to, copy);
2813			csum = csum_block_add(csum, csum2, pos);
2814			if ((len -= copy) == 0)
2815				return csum;
2816			offset += copy;
2817			to     += copy;
2818			pos    += copy;
2819		}
2820		start = end;
2821	}
2822	BUG_ON(len);
2823	return csum;
2824}
2825EXPORT_SYMBOL(skb_copy_and_csum_bits);
2826
2827__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
2828{
2829	__sum16 sum;
2830
2831	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
2832	/* See comments in __skb_checksum_complete(). */
2833	if (likely(!sum)) {
2834		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2835		    !skb->csum_complete_sw)
2836			netdev_rx_csum_fault(skb->dev, skb);
2837	}
2838	if (!skb_shared(skb))
2839		skb->csum_valid = !sum;
2840	return sum;
2841}
2842EXPORT_SYMBOL(__skb_checksum_complete_head);
2843
2844/* This function assumes skb->csum already holds pseudo header's checksum,
2845 * which has been changed from the hardware checksum, for example, by
2846 * __skb_checksum_validate_complete(). And, the original skb->csum must
2847 * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
2848 *
2849 * It returns non-zero if the recomputed checksum is still invalid, otherwise
2850 * zero. The new checksum is stored back into skb->csum unless the skb is
2851 * shared.
2852 */
2853__sum16 __skb_checksum_complete(struct sk_buff *skb)
2854{
2855	__wsum csum;
2856	__sum16 sum;
2857
2858	csum = skb_checksum(skb, 0, skb->len, 0);
2859
2860	sum = csum_fold(csum_add(skb->csum, csum));
2861	/* This check is inverted, because we already knew the hardware
2862	 * checksum is invalid before calling this function. So, if the
2863	 * re-computed checksum is valid instead, then we have a mismatch
2864	 * between the original skb->csum and skb_checksum(). This means either
2865	 * the original hardware checksum is incorrect or we screw up skb->csum
2866	 * when moving skb->data around.
2867	 */
2868	if (likely(!sum)) {
2869		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2870		    !skb->csum_complete_sw)
2871			netdev_rx_csum_fault(skb->dev, skb);
2872	}
2873
2874	if (!skb_shared(skb)) {
2875		/* Save full packet checksum */
2876		skb->csum = csum;
2877		skb->ip_summed = CHECKSUM_COMPLETE;
2878		skb->csum_complete_sw = 1;
2879		skb->csum_valid = !sum;
2880	}
2881
2882	return sum;
2883}
2884EXPORT_SYMBOL(__skb_checksum_complete);
2885
2886static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
2887{
2888	net_warn_ratelimited(
2889		"%s: attempt to compute crc32c without libcrc32c.ko\n",
2890		__func__);
2891	return 0;
2892}
2893
2894static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
2895				       int offset, int len)
2896{
2897	net_warn_ratelimited(
2898		"%s: attempt to compute crc32c without libcrc32c.ko\n",
2899		__func__);
2900	return 0;
2901}
2902
2903static const struct skb_checksum_ops default_crc32c_ops = {
2904	.update  = warn_crc32c_csum_update,
2905	.combine = warn_crc32c_csum_combine,
2906};
2907
2908const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
2909	&default_crc32c_ops;
2910EXPORT_SYMBOL(crc32c_csum_stub);
2911
2912 /**
2913 *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
2914 *	@from: source buffer
2915 *
2916 *	Calculates the amount of linear headroom needed in the 'to' skb passed
2917 *	into skb_zerocopy().
2918 */
2919unsigned int
2920skb_zerocopy_headlen(const struct sk_buff *from)
2921{
2922	unsigned int hlen = 0;
2923
2924	if (!from->head_frag ||
2925	    skb_headlen(from) < L1_CACHE_BYTES ||
2926	    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
2927		hlen = skb_headlen(from);
2928
2929	if (skb_has_frag_list(from))
2930		hlen = from->len;
2931
2932	return hlen;
2933}
2934EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
2935
2936/**
2937 *	skb_zerocopy - Zero copy skb to skb
2938 *	@to: destination buffer
2939 *	@from: source buffer
2940 *	@len: number of bytes to copy from source buffer
2941 *	@hlen: size of linear headroom in destination buffer
2942 *
2943 *	Copies up to `len` bytes from `from` to `to` by creating references
2944 *	to the frags in the source buffer.
2945 *
2946 *	The `hlen` as calculated by skb_zerocopy_headlen() specifies the
2947 *	headroom in the `to` buffer.
2948 *
2949 *	Return value:
2950 *	0: everything is OK
2951 *	-ENOMEM: couldn't orphan frags of @from due to lack of memory
2952 *	-EFAULT: skb_copy_bits() found some problem with skb geometry
2953 */
2954int
2955skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
2956{
2957	int i, j = 0;
2958	int plen = 0; /* length of skb->head fragment */
2959	int ret;
2960	struct page *page;
2961	unsigned int offset;
2962
2963	BUG_ON(!from->head_frag && !hlen);
2964
2965	/* dont bother with small payloads */
2966	if (len <= skb_tailroom(to))
2967		return skb_copy_bits(from, 0, skb_put(to, len), len);
2968
2969	if (hlen) {
2970		ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
2971		if (unlikely(ret))
2972			return ret;
2973		len -= hlen;
2974	} else {
2975		plen = min_t(int, skb_headlen(from), len);
2976		if (plen) {
2977			page = virt_to_head_page(from->head);
2978			offset = from->data - (unsigned char *)page_address(page);
2979			__skb_fill_page_desc(to, 0, page, offset, plen);
2980			get_page(page);
2981			j = 1;
2982			len -= plen;
2983		}
2984	}
2985
2986	to->truesize += len + plen;
2987	to->len += len + plen;
2988	to->data_len += len + plen;
2989
2990	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
2991		skb_tx_error(from);
2992		return -ENOMEM;
2993	}
2994	skb_zerocopy_clone(to, from, GFP_ATOMIC);
2995
2996	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
2997		int size;
2998
2999		if (!len)
3000			break;
3001		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
3002		size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
3003					len);
3004		skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
3005		len -= size;
3006		skb_frag_ref(to, j);
3007		j++;
3008	}
3009	skb_shinfo(to)->nr_frags = j;
3010
3011	return 0;
3012}
3013EXPORT_SYMBOL_GPL(skb_zerocopy);
3014
3015void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
3016{
3017	__wsum csum;
3018	long csstart;
3019
3020	if (skb->ip_summed == CHECKSUM_PARTIAL)
3021		csstart = skb_checksum_start_offset(skb);
3022	else
3023		csstart = skb_headlen(skb);
3024
3025	BUG_ON(csstart > skb_headlen(skb));
3026
3027	skb_copy_from_linear_data(skb, to, csstart);
3028
3029	csum = 0;
3030	if (csstart != skb->len)
3031		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
3032					      skb->len - csstart);
3033
3034	if (skb->ip_summed == CHECKSUM_PARTIAL) {
3035		long csstuff = csstart + skb->csum_offset;
3036
3037		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
3038	}
3039}
3040EXPORT_SYMBOL(skb_copy_and_csum_dev);
3041
3042/**
3043 *	skb_dequeue - remove from the head of the queue
3044 *	@list: list to dequeue from
3045 *
3046 *	Remove the head of the list. The list lock is taken so the function
3047 *	may be used safely with other locking list functions. The head item is
3048 *	returned or %NULL if the list is empty.
3049 */
3050
3051struct sk_buff *skb_dequeue(struct sk_buff_head *list)
3052{
3053	unsigned long flags;
3054	struct sk_buff *result;
3055
3056	spin_lock_irqsave(&list->lock, flags);
3057	result = __skb_dequeue(list);
3058	spin_unlock_irqrestore(&list->lock, flags);
3059	return result;
3060}
3061EXPORT_SYMBOL(skb_dequeue);
3062
3063/**
3064 *	skb_dequeue_tail - remove from the tail of the queue
3065 *	@list: list to dequeue from
3066 *
3067 *	Remove the tail of the list. The list lock is taken so the function
3068 *	may be used safely with other locking list functions. The tail item is
3069 *	returned or %NULL if the list is empty.
3070 */
3071struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
3072{
3073	unsigned long flags;
3074	struct sk_buff *result;
3075
3076	spin_lock_irqsave(&list->lock, flags);
3077	result = __skb_dequeue_tail(list);
3078	spin_unlock_irqrestore(&list->lock, flags);
3079	return result;
3080}
3081EXPORT_SYMBOL(skb_dequeue_tail);
3082
3083/**
3084 *	skb_queue_purge - empty a list
3085 *	@list: list to empty
3086 *
3087 *	Delete all buffers on an &sk_buff list. Each buffer is removed from
3088 *	the list and one reference dropped. This function takes the list
3089 *	lock and is atomic with respect to other list locking functions.
3090 */
3091void skb_queue_purge(struct sk_buff_head *list)
3092{
3093	struct sk_buff *skb;
3094	while ((skb = skb_dequeue(list)) != NULL)
3095		kfree_skb(skb);
3096}
3097EXPORT_SYMBOL(skb_queue_purge);
3098
3099/**
3100 *	skb_rbtree_purge - empty a skb rbtree
3101 *	@root: root of the rbtree to empty
3102 *	Return value: the sum of truesizes of all purged skbs.
3103 *
3104 *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
3105 *	the list and one reference dropped. This function does not take
3106 *	any lock. Synchronization should be handled by the caller (e.g., TCP
3107 *	out-of-order queue is protected by the socket lock).
3108 */
3109unsigned int skb_rbtree_purge(struct rb_root *root)
3110{
3111	struct rb_node *p = rb_first(root);
3112	unsigned int sum = 0;
3113
3114	while (p) {
3115		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
3116
3117		p = rb_next(p);
3118		rb_erase(&skb->rbnode, root);
3119		sum += skb->truesize;
3120		kfree_skb(skb);
3121	}
3122	return sum;
3123}
3124
3125/**
3126 *	skb_queue_head - queue a buffer at the list head
3127 *	@list: list to use
3128 *	@newsk: buffer to queue
3129 *
3130 *	Queue a buffer at the start of the list. This function takes the
3131 *	list lock and can be used safely with other locking &sk_buff functions
3132 *	safely.
3133 *
3134 *	A buffer cannot be placed on two lists at the same time.
3135 */
3136void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
3137{
3138	unsigned long flags;
3139
3140	spin_lock_irqsave(&list->lock, flags);
3141	__skb_queue_head(list, newsk);
3142	spin_unlock_irqrestore(&list->lock, flags);
3143}
3144EXPORT_SYMBOL(skb_queue_head);
3145
3146/**
3147 *	skb_queue_tail - queue a buffer at the list tail
3148 *	@list: list to use
3149 *	@newsk: buffer to queue
3150 *
3151 *	Queue a buffer at the tail of the list. This function takes the
3152 *	list lock and can be used safely with other locking &sk_buff functions
3153 *	safely.
3154 *
3155 *	A buffer cannot be placed on two lists at the same time.
3156 */
3157void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
3158{
3159	unsigned long flags;
3160
3161	spin_lock_irqsave(&list->lock, flags);
3162	__skb_queue_tail(list, newsk);
3163	spin_unlock_irqrestore(&list->lock, flags);
3164}
3165EXPORT_SYMBOL(skb_queue_tail);
3166
3167/**
3168 *	skb_unlink	-	remove a buffer from a list
3169 *	@skb: buffer to remove
3170 *	@list: list to use
3171 *
3172 *	Remove a packet from a list. The list locks are taken and this
3173 *	function is atomic with respect to other list locked calls
3174 *
3175 *	You must know what list the SKB is on.
3176 */
3177void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
3178{
3179	unsigned long flags;
3180
3181	spin_lock_irqsave(&list->lock, flags);
3182	__skb_unlink(skb, list);
3183	spin_unlock_irqrestore(&list->lock, flags);
3184}
3185EXPORT_SYMBOL(skb_unlink);
3186
3187/**
3188 *	skb_append	-	append a buffer
3189 *	@old: buffer to insert after
3190 *	@newsk: buffer to insert
3191 *	@list: list to use
3192 *
3193 *	Place a packet after a given packet in a list. The list locks are taken
3194 *	and this function is atomic with respect to other list locked calls.
3195 *	A buffer cannot be placed on two lists at the same time.
3196 */
3197void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
3198{
3199	unsigned long flags;
3200
3201	spin_lock_irqsave(&list->lock, flags);
3202	__skb_queue_after(list, old, newsk);
3203	spin_unlock_irqrestore(&list->lock, flags);
3204}
3205EXPORT_SYMBOL(skb_append);
3206
3207static inline void skb_split_inside_header(struct sk_buff *skb,
3208					   struct sk_buff* skb1,
3209					   const u32 len, const int pos)
3210{
3211	int i;
3212
3213	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
3214					 pos - len);
3215	/* And move data appendix as is. */
3216	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
3217		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
3218
3219	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
3220	skb_shinfo(skb)->nr_frags  = 0;
3221	skb1->data_len		   = skb->data_len;
3222	skb1->len		   += skb1->data_len;
3223	skb->data_len		   = 0;
3224	skb->len		   = len;
3225	skb_set_tail_pointer(skb, len);
3226}
3227
3228static inline void skb_split_no_header(struct sk_buff *skb,
3229				       struct sk_buff* skb1,
3230				       const u32 len, int pos)
3231{
3232	int i, k = 0;
3233	const int nfrags = skb_shinfo(skb)->nr_frags;
3234
3235	skb_shinfo(skb)->nr_frags = 0;
3236	skb1->len		  = skb1->data_len = skb->len - len;
3237	skb->len		  = len;
3238	skb->data_len		  = len - pos;
3239
3240	for (i = 0; i < nfrags; i++) {
3241		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
3242
3243		if (pos + size > len) {
3244			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
3245
3246			if (pos < len) {
3247				/* Split frag.
3248				 * We have two variants in this case:
3249				 * 1. Move all the frag to the second
3250				 *    part, if it is possible. F.e.
3251				 *    this approach is mandatory for TUX,
3252				 *    where splitting is expensive.
3253				 * 2. Split is accurately. We make this.
3254				 */
3255				skb_frag_ref(skb, i);
3256				skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
3257				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
3258				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
3259				skb_shinfo(skb)->nr_frags++;
3260			}
3261			k++;
3262		} else
3263			skb_shinfo(skb)->nr_frags++;
3264		pos += size;
3265	}
3266	skb_shinfo(skb1)->nr_frags = k;
3267}
3268
3269/**
3270 * skb_split - Split fragmented skb to two parts at length len.
3271 * @skb: the buffer to split
3272 * @skb1: the buffer to receive the second part
3273 * @len: new length for skb
3274 */
3275void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
3276{
3277	int pos = skb_headlen(skb);
3278
3279	skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
3280				      SKBTX_SHARED_FRAG;
3281	skb_zerocopy_clone(skb1, skb, 0);
3282	if (len < pos)	/* Split line is inside header. */
3283		skb_split_inside_header(skb, skb1, len, pos);
3284	else		/* Second chunk has no header, nothing to copy. */
3285		skb_split_no_header(skb, skb1, len, pos);
3286}
3287EXPORT_SYMBOL(skb_split);
3288
3289/* Shifting from/to a cloned skb is a no-go.
3290 *
3291 * Caller cannot keep skb_shinfo related pointers past calling here!
3292 */
3293static int skb_prepare_for_shift(struct sk_buff *skb)
3294{
3295	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3296}
3297
/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns number bytes shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * Returns 0 (nothing shifted) when @skb has linear data, when either buffer
 * is a zerocopy skb, when un-cloning either buffer fails, or when @tgt has
 * no room for the fragments.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	/* from:  index of the next source fragment to move
	 * to:    index of the next free fragment slot in tgt
	 * merge: index of the tgt fragment that absorbs the first source
	 *        fragment, or -1 when no coalescing is possible
	 * todo:  bytes that still have to be moved
	 */
	int from, to, merge, todo;
	skb_frag_t *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);

	/* Only purely paged source buffers are supported. */
	if (skb_headlen(skb))
		return 0;
	if (skb_zcopy(tgt) || skb_zcopy(skb))
		return 0;

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      skb_frag_off(fragfrom))) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			/* The request ends inside the first source fragment
			 * and that fragment coalesces with tgt's last one:
			 * no descriptor moves, only sizes/offset change.
			 */
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			skb_frag_off_add(fragfrom, shiftlen);

			goto onlymerged;
		}

		/* First fragment will be merged as a whole; start moving
		 * descriptors from the second one.
		 */
		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		/* Out of slots in tgt.  Nothing has been committed so far:
		 * only tgt slots past its current nr_frags were written.
		 */
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			/* Whole fragment moves: the descriptor (and with it
			 * the page reference) is handed over as is.
			 */
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			/* Partial fragment: both buffers end up pointing at
			 * the same page, so take an extra reference.  tgt
			 * gets the first todo bytes, skb keeps the rest.
			 */
			__skb_frag_ref(fragfrom);
			skb_frag_page_copy(fragto, fragfrom);
			skb_frag_off_copy(fragto, fragfrom);
			skb_frag_size_set(fragto, todo);

			skb_frag_off_add(fragfrom, todo);
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		/* Fold the first source fragment into tgt's fragment "merge"
		 * and drop the source's reference on that page.
		 */
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	/* Bytes left to move but no source fragments left: inconsistent skb. */
	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}
3435
3436/**
3437 * skb_prepare_seq_read - Prepare a sequential read of skb data
3438 * @skb: the buffer to read
3439 * @from: lower offset of data to be read
3440 * @to: upper offset of data to be read
3441 * @st: state variable
3442 *
3443 * Initializes the specified state variable. Must be called before
3444 * invoking skb_seq_read() for the first time.
3445 */
3446void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
3447			  unsigned int to, struct skb_seq_state *st)
3448{
3449	st->lower_offset = from;
3450	st->upper_offset = to;
3451	st->root_skb = st->cur_skb = skb;
3452	st->frag_idx = st->stepped_offset = 0;
3453	st->frag_data = NULL;
3454	st->frag_off = 0;
3455}
3456EXPORT_SYMBOL(skb_prepare_seq_read);
3457
/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
			  struct skb_seq_state *st)
{
	/* abs_offset:  position being asked for, counted from the start of
	 *              the root skb's data
	 * block_limit: absolute offset just past the block under inspection
	 * st->stepped_offset is the absolute offset at which that block
	 * starts, i.e. the amount of data already stepped over.
	 */
	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
	skb_frag_t *frag;

	/* Reached the upper bound: release any mapping and report the end. */
	if (unlikely(abs_offset >= st->upper_offset)) {
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}
		return 0;
	}

next_skb:
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

	/* Requested offset lies in the linear area of the current skb. */
	if (abs_offset < block_limit && !st->frag_data) {
		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
		return block_limit - abs_offset;
	}

	/* Moving past the linear area: account for it before walking frags. */
	if (st->frag_idx == 0 && !st->frag_data)
		st->stepped_offset += skb_headlen(st->cur_skb);

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
		unsigned int pg_idx, pg_off, pg_sz;

		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];

		/* By default the whole fragment is treated as one block. */
		pg_idx = 0;
		pg_off = skb_frag_off(frag);
		pg_sz = skb_frag_size(frag);

		/* When the fragment has to be walked page by page, restrict
		 * the block to the page that contains st->frag_off.
		 */
		if (skb_frag_must_loop(skb_frag_page(frag))) {
			pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
			pg_off = offset_in_page(pg_off + st->frag_off);
			pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
						    PAGE_SIZE - pg_off);
		}

		block_limit = pg_sz + st->stepped_offset;
		if (abs_offset < block_limit) {
			/* Map lazily; the mapping is kept in @st across
			 * calls until this block has been stepped over.
			 */
			if (!st->frag_data)
				st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);

			*data = (u8 *)st->frag_data + pg_off +
				(abs_offset - st->stepped_offset);

			return block_limit - abs_offset;
		}

		/* Block fully consumed: unmap it before moving on. */
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}

		st->stepped_offset += pg_sz;
		st->frag_off += pg_sz;
		if (st->frag_off == skb_frag_size(frag)) {
			st->frag_off = 0;
			st->frag_idx++;
		}
	}

	if (st->frag_data) {
		kunmap_atomic(st->frag_data);
		st->frag_data = NULL;
	}

	/* Fragments exhausted: descend into the root's frag_list first,
	 * then follow ->next along that list.
	 */
	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	} else if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;
	}

	return 0;
}
EXPORT_SYMBOL(skb_seq_read);
3566
3567/**
3568 * skb_abort_seq_read - Abort a sequential read of skb data
3569 * @st: state variable
3570 *
3571 * Must be called if skb_seq_read() was not called until it
3572 * returned 0.
3573 */
3574void skb_abort_seq_read(struct skb_seq_state *st)
3575{
3576	if (st->frag_data)
3577		kunmap_atomic(st->frag_data);
3578}
3579EXPORT_SYMBOL(skb_abort_seq_read);
3580
3581#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))
3582
3583static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
3584					  struct ts_config *conf,
3585					  struct ts_state *state)
3586{
3587	return skb_seq_read(offset, text, TS_SKB_CB(state));
3588}
3589
/* textsearch finish hook: drop whatever the sequential reader still has
 * mapped.  @conf is unused.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	struct skb_seq_state *seq = TS_SKB_CB(state);

	skb_abort_seq_read(seq);
}
3594
3595/**
3596 * skb_find_text - Find a text pattern in skb data
3597 * @skb: the buffer to look in
3598 * @from: search offset
3599 * @to: search limit
3600 * @config: textsearch configuration
3601 *
3602 * Finds a pattern in the skb data according to the specified
3603 * textsearch configuration. Use textsearch_next() to retrieve
3604 * subsequent occurrences of the pattern. Returns the offset
3605 * to the first occurrence or UINT_MAX if no match was found.
3606 */
3607unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
3608			   unsigned int to, struct ts_config *config)
3609{
3610	struct ts_state state;
3611	unsigned int ret;
3612
3613	config->get_next_block = skb_ts_get_next_block;
3614	config->finish = skb_ts_finish;
3615
3616	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
3617
3618	ret = textsearch_find(config, &state);
3619	return (ret <= to - from ? ret : UINT_MAX);
3620}
3621EXPORT_SYMBOL(skb_find_text);
3622
3623int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
3624			 int offset, size_t size)
3625{
3626	int i = skb_shinfo(skb)->nr_frags;
3627
3628	if (skb_can_coalesce(skb, i, page, offset)) {
3629		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
3630	} else if (i < MAX_SKB_FRAGS) {
3631		get_page(page);
3632		skb_fill_page_desc(skb, i, page, offset, size);
3633	} else {
3634		return -EMSGSIZE;
3635	}
3636
3637	return 0;
3638}
3639EXPORT_SYMBOL_GPL(skb_append_pagefrags);
3640
3641/**
3642 *	skb_pull_rcsum - pull skb and update receive checksum
3643 *	@skb: buffer to update
3644 *	@len: length of data pulled
3645 *
3646 *	This function performs an skb_pull on the packet and updates
3647 *	the CHECKSUM_COMPLETE checksum.  It should be used on
3648 *	receive path processing instead of skb_pull unless you know
3649 *	that the checksum difference is zero (e.g., a valid IP header)
3650 *	or you are setting ip_summed to CHECKSUM_NONE.
3651 */
3652void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
3653{
3654	unsigned char *data = skb->data;
3655
3656	BUG_ON(len > skb->len);
3657	__skb_pull(skb, len);
3658	skb_postpull_rcsum(skb, data, len);
3659	return skb->data;
3660}
3661EXPORT_SYMBOL_GPL(skb_pull_rcsum);
3662
3663static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
3664{
3665	skb_frag_t head_frag;
3666	struct page *page;
3667
3668	page = virt_to_head_page(frag_skb->head);
3669	__skb_frag_set_page(&head_frag, page);
3670	skb_frag_off_set(&head_frag, frag_skb->data -
3671			 (unsigned char *)page_address(page));
3672	skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
3673	return head_frag;
3674}
3675
/* Segment @skb by turning the members of its frag_list into a plain ->next
 * chain of stand-alone packets headed by @skb itself.
 *
 * @features: device features, used to decide whether a segment has to be
 *            linearized
 * @offset:   passed to skb_push() relative to the network header and used
 *            as the number of header bytes copied from @skb into every
 *            list member
 *
 * Returns @skb (with an extra reference taken and skb->prev pointing at the
 * last segment) or ERR_PTR(-ENOMEM) on failure, in which case the chain
 * hanging off skb->next has been freed.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
				 netdev_features_t features,
				 unsigned int offset)
{
	struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
	unsigned int tnl_hlen = skb_tnl_header_len(skb);
	unsigned int delta_truesize = 0;
	unsigned int delta_len = 0;
	struct sk_buff *tail = NULL;
	struct sk_buff *nskb, *tmp;
	int err;

	skb_push(skb, -skb_network_offset(skb) + offset);

	/* The list members are re-linked through ->next below. */
	skb_shinfo(skb)->frag_list = NULL;

	do {
		nskb = list_skb;
		list_skb = list_skb->next;

		err = 0;
		if (skb_shared(nskb)) {
			/* Somebody else holds a reference: work on a private
			 * clone with its own copy of the head instead.
			 */
			tmp = skb_clone(nskb, GFP_ATOMIC);
			if (tmp) {
				consume_skb(nskb);
				nskb = tmp;
				err = skb_unclone(nskb, GFP_ATOMIC);
			} else {
				err = -ENOMEM;
			}
		}

		/* Link before checking err so that the error path frees
		 * nskb together with the rest of the chain.
		 */
		if (!tail)
			skb->next = nskb;
		else
			tail->next = nskb;

		if (unlikely(err)) {
			nskb->next = list_skb;
			goto err_linearize;
		}

		tail = nskb;

		/* Accumulate what has to be subtracted from the head skb. */
		delta_len += nskb->len;
		delta_truesize += nskb->truesize;

		skb_push(nskb, -skb_network_offset(nskb) + offset);

		/* Replace nskb's header state with a copy of skb's. */
		skb_release_head_state(nskb);
		 __copy_skb_header(nskb, skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
		/* Copy the header bytes themselves, including tnl_hlen bytes
		 * located in front of skb->data.
		 */
		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 offset + tnl_hlen);

		if (skb_needs_linearize(nskb, features) &&
		    __skb_linearize(nskb))
			goto err_linearize;

	} while (list_skb);

	/* The head no longer accounts for the data of the list members. */
	skb->truesize = skb->truesize - delta_truesize;
	skb->data_len = skb->data_len - delta_len;
	skb->len = skb->len - delta_len;

	skb_gso_reset(skb);

	skb->prev = tail;

	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto err_linearize;

	skb_get(skb);

	return skb;

err_linearize:
	kfree_skb_list(skb->next);
	skb->next = NULL;
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);
3761
3762int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
3763{
3764	if (unlikely(p->len + skb->len >= 65536))
3765		return -E2BIG;
3766
3767	if (NAPI_GRO_CB(p)->last == p)
3768		skb_shinfo(p)->frag_list = skb;
3769	else
3770		NAPI_GRO_CB(p)->last->next = skb;
3771
3772	skb_pull(skb, skb_gro_offset(skb));
3773
3774	NAPI_GRO_CB(p)->last = skb;
3775	NAPI_GRO_CB(p)->count++;
3776	p->data_len += skb->len;
3777	p->truesize += skb->truesize;
3778	p->len += skb->len;
3779
3780	NAPI_GRO_CB(skb)->same_flow = 1;
3781
3782	return 0;
3783}
3784
/**
 *	skb_segment - Perform protocol segmentation on skb.
 *	@head_skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function performs segmentation on the given skb.  It returns
 *	a pointer to the first in a list of new skbs for the segments.
 *	In case of error it returns ERR_PTR(err).
 *
 *	Each segment gets a copy of @head_skb's headers (everything between
 *	the MAC header and skb->data on entry) followed by up to gso_size
 *	bytes of payload, or by one frag_list member's worth of payload when
 *	gso_size is GSO_BY_FRAGS.  On success segs->prev points at the last
 *	segment.
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
			    netdev_features_t features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
	skb_frag_t *frag = skb_shinfo(head_skb)->frags;
	unsigned int mss = skb_shinfo(head_skb)->gso_size;
	/* Length of the headers that precede the payload. */
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	/* skb whose frags[] array "frag"/"i" currently walk. */
	struct sk_buff *frag_skb = head_skb;
	/* Offset into head_skb (after the push below) of the current segment. */
	unsigned int offset = doffset;
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
	unsigned int partial_segs = 0;
	unsigned int headroom;
	unsigned int len = head_skb->len;
	__be16 proto;
	bool csum, sg;
	int nfrags = skb_shinfo(head_skb)->nr_frags;
	int err = -ENOMEM;
	int i = 0;
	/* Offset into head_skb at which the fragment "frag" begins. */
	int pos;

	if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
	    (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
		/* gso_size is untrusted, and we have a frag_list with a linear
		 * non head_frag head.
		 *
		 * (we assume checking the first list_skb member suffices;
		 * i.e if either of the list_skb members have non head_frag
		 * head, then the first one has too).
		 *
		 * If head_skb's headlen does not fit requested gso_size, it
		 * means that the frag_list members do NOT terminate on exact
		 * gso_size boundaries. Hence we cannot perform skb_frag_t page
		 * sharing. Therefore we must fallback to copying the frag_list
		 * skbs; we do so by disabling SG.
		 */
		if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
			features &= ~NETIF_F_SG;
	}

	/* Make the headers part of head_skb's data again. */
	__skb_push(head_skb, doffset);
	proto = skb_network_protocol(head_skb, NULL);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);

	sg = !!(features & NETIF_F_SG);
	csum = !!can_checksum_protocol(features, proto);

	if (sg && csum && (mss != GSO_BY_FRAGS))  {
		if (!(features & NETIF_F_GSO_PARTIAL)) {
			struct sk_buff *iter;
			unsigned int frag_len;

			if (!list_skb ||
			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
				goto normal;

			/* If we get here then all the required
			 * GSO features except frag_list are supported.
			 * Try to split the SKB to multiple GSO SKBs
			 * with no frag_list.
			 * Currently we can do that only when the buffers don't
			 * have a linear part and all the buffers except
			 * the last are of the same length.
			 */
			frag_len = list_skb->len;
			skb_walk_frags(head_skb, iter) {
				if (frag_len != iter->len && iter->next)
					goto normal;
				if (skb_headlen(iter) && !iter->head_frag)
					goto normal;

				len -= iter->len;
			}

			/* len is now head_skb's own share of the data. */
			if (len != frag_len)
				goto normal;
		}

		/* GSO partial only requires that we trim off any excess that
		 * doesn't fit into an MSS sized block, so take care of that
		 * now.
		 */
		partial_segs = len / mss;
		if (partial_segs > 1)
			mss *= partial_segs;
		else
			partial_segs = 0;
	}

normal:
	headroom = skb_headroom(head_skb);
	pos = skb_headlen(head_skb);

	/* One iteration per output segment covering [offset, offset + len). */
	do {
		struct sk_buff *nskb;
		skb_frag_t *nskb_frag;
		int hsize;
		int size;

		if (unlikely(mss == GSO_BY_FRAGS)) {
			len = list_skb->len;
		} else {
			len = head_skb->len - offset;
			if (len > mss)
				len = mss;
		}

		/* hsize: payload bytes of this segment that are taken from
		 * head_skb's linear area (the whole segment without SG).
		 */
		hsize = skb_headlen(head_skb) - offset;
		if (hsize < 0)
			hsize = 0;
		if (hsize > len || !sg)
			hsize = len;

		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			/* head_skb's own data is used up and the next
			 * frag_list member has linear data: build the segment
			 * from a clone of that member instead of copying.
			 */
			BUG_ON(skb_headlen(list_skb) > len);

			i = 0;
			nfrags = skb_shinfo(list_skb)->nr_frags;
			frag = skb_shinfo(list_skb)->frags;
			frag_skb = list_skb;
			pos += skb_headlen(list_skb);

			/* Step over the frags that fit entirely in this
			 * segment.
			 */
			while (pos < offset + len) {
				BUG_ON(i >= nfrags);

				size = skb_frag_size(frag);
				if (pos + size > offset + len)
					break;

				i++;
				pos += size;
				frag++;
			}

			nskb = skb_clone(list_skb, GFP_ATOMIC);
			list_skb = list_skb->next;

			if (unlikely(!nskb))
				goto err;

			if (unlikely(pskb_trim(nskb, len))) {
				kfree_skb(nskb);
				goto err;
			}

			/* hsize is reused here to remember the old end
			 * offset so truesize can be fixed up after the COW.
			 */
			hsize = skb_end_offset(nskb);
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
		} else {
			/* Fresh skb with room for headroom, headers and the
			 * linear part of the payload.
			 */
			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		/* Append to the result list before anything else can fail so
		 * the error path frees it.
		 */
		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		__copy_skb_header(nskb, head_skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		skb_reset_mac_len(nskb);

		/* Copy the header bytes, including tnl_hlen bytes located in
		 * front of the data pointer.
		 */
		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);

		/* Segment already complete (cloned frag_list member). */
		if (nskb->len == len + doffset)
			goto perform_csum_check;

		if (!sg) {
			/* No scatter-gather: copy the payload into the
			 * linear area, checksumming on the way if needed.
			 */
			if (!csum) {
				if (!nskb->remcsum_offload)
					nskb->ip_summed = CHECKSUM_NONE;
				SKB_GSO_CB(nskb)->csum =
					skb_copy_and_csum_bits(head_skb, offset,
							       skb_put(nskb,
								       len),
							       len);
				SKB_GSO_CB(nskb)->csum_start =
					skb_headroom(nskb) + doffset;
			} else {
				skb_copy_bits(head_skb, offset,
					      skb_put(nskb, len),
					      len);
			}
			continue;
		}

		nskb_frag = skb_shinfo(nskb)->frags;

		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);

		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
					      SKBTX_SHARED_FRAG;

		if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
		    skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
			goto err;

		/* Attach page fragments until the segment is full. */
		while (pos < offset + len) {
			if (i >= nfrags) {
				/* Current frag array exhausted: continue
				 * with the next frag_list member.
				 */
				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;
				if (!skb_headlen(list_skb)) {
					BUG_ON(!nfrags);
				} else {
					BUG_ON(!list_skb->head_frag);

					/* to make room for head_frag. */
					i--;
					frag--;
				}
				if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
				    skb_zerocopy_clone(nskb, frag_skb,
						       GFP_ATOMIC))
					goto err;

				list_skb = list_skb->next;
			}

			if (unlikely(skb_shinfo(nskb)->nr_frags >=
				     MAX_SKB_FRAGS)) {
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
				err = -EINVAL;
				goto err;
			}

			/* i < 0 denotes the linear head of frag_skb, which is
			 * presented as a page fragment.
			 */
			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			/* Fragment started in the previous segment: skip the
			 * part that was already used there.
			 */
			if (pos < offset) {
				skb_frag_off_add(nskb_frag, offset - pos);
				skb_frag_size_sub(nskb_frag, offset - pos);
			}

			skb_shinfo(nskb)->nr_frags++;

			if (pos + size <= offset + len) {
				i++;
				frag++;
				pos += size;
			} else {
				/* Fragment extends past this segment: trim
				 * the tail and revisit it for the next one.
				 */
				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
				goto skip_fraglist;
			}

			nskb_frag++;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;

perform_csum_check:
		if (!csum) {
			/* Checksum computed here in software. */
			if (skb_has_shared_frag(nskb) &&
			    __skb_linearize(nskb))
				goto err;

			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_checksum(nskb, doffset,
					     nskb->len - doffset, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
		}
	} while ((offset += len) < head_skb->len);

	/* Some callers want to get the end of the list.
	 * Put it in segs->prev to avoid walking the list.
	 * (see validate_xmit_skb_list() for example)
	 */
	segs->prev = tail;

	if (partial_segs) {
		struct sk_buff *iter;
		int type = skb_shinfo(head_skb)->gso_type;
		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

		/* Update type to add partial and then remove dodgy if set */
		type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
		type &= ~SKB_GSO_DODGY;

		/* Update GSO info and prepare to start updating headers on
		 * our way back down the stack of protocols.
		 */
		for (iter = segs; iter; iter = iter->next) {
			skb_shinfo(iter)->gso_size = gso_size;
			skb_shinfo(iter)->gso_segs = partial_segs;
			skb_shinfo(iter)->gso_type = type;
			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
		}

		/* The last segment may be shorter than the others. */
		if (tail->len - doffset <= gso_size)
			skb_shinfo(tail)->gso_size = 0;
		else if (tail != segs)
			skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
	}

	/* Following permits correct backpressure, for protocols
	 * using skb_set_owner_w().
	 * Idea is to transfer ownership from head_skb to last segment.
	 */
	if (head_skb->destructor == sock_wfree) {
		swap(tail->truesize, head_skb->truesize);
		swap(tail->destructor, head_skb->destructor);
		swap(tail->sk, head_skb->sk);
	}
	return segs;

err:
	kfree_skb_list(segs);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);
4135
/* Merge the payload of @skb (everything past its GRO offset) into the GRO
 * packet @p.
 *
 * Three strategies are tried in turn:
 *  1. the payload lives only in page fragments: move the fragment
 *     descriptors into the last skb of @p's chain;
 *  2. @skb's head is a page fragment (head_frag): describe the linear
 *     payload as one more fragment and move everything as in 1;
 *  3. otherwise, or when the fragment array would overflow: chain @skb
 *     itself onto @p's frag_list.
 *
 * Returns 0 on success, -E2BIG when the result would reach 64KB or @skb is
 * marked to be flushed.
 */
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	unsigned int len = skb_gro_len(skb);
	unsigned int delta_truesize;
	struct sk_buff *lp;

	if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
		return -E2BIG;

	/* lp: the skb whose fragment array receives the new fragments. */
	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);

	if (headlen <= offset) {
		/* Case 1: the linear area holds no payload. */
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/* offset now counts the bytes to skip in the first frag. */
		offset -= headlen;
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		/* Copy the descriptors back to front; after the loop "frag"
		 * points at the first one that was moved.
		 */
		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		skb_frag_off_add(frag, offset);
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		/* Tells the caller how to dispose of skb. */
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	} else if (skb->head_frag) {
		/* Case 2: linear payload backed by a page fragment. */
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/* Offset of the payload within the head page. */
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		__skb_frag_set_page(frag, page);
		skb_frag_off_set(frag, first_offset);
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}

merge:
	/* Case 3: keep skb as a whole and chain it onto p. */
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		/* Part of what has to be skipped sits in the first frag. */
		unsigned int eat = offset - headlen;

		skb_frag_off_add(&skbinfo->frags[0], eat);
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	else
		NAPI_GRO_CB(p)->last->next = skb;
	NAPI_GRO_CB(p)->last = skb;
	__skb_header_release(skb);
	/* Only p itself needs its counters updated below. */
	lp = p;

done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	if (lp != p) {
		/* Fragments went into a chained skb: account there too. */
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
4246
4247#ifdef CONFIG_SKB_EXTENSIONS
/* Extension sizes are expressed in chunks of SKB_EXT_ALIGN_VALUE bytes. */
#define SKB_EXT_ALIGN_VALUE	8
/* Size of type @x, rounded up to whole chunks and expressed in chunks. */
#define SKB_EXT_CHUNKSIZEOF(x)	(ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/* Length in chunks of each extension type, indexed by extension id.
 * Only the extensions enabled in this build get an entry.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
	[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	[TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
	[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
};
4265
/* Size, in SKB_EXT_ALIGN_VALUE chunks, of a struct skb_ext header followed
 * by one instance of every extension enabled in this build.
 *
 * The result is checked with BUILD_BUG_ON() in skb_extensions_init();
 * presumably that is why the function is __always_inline, so the sum can be
 * evaluated at compile time.
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
	return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
		skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
#endif
#ifdef CONFIG_XFRM
		skb_ext_type_len[SKB_EXT_SEC_PATH] +
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
		skb_ext_type_len[TC_SKB_EXT] +
#endif
#if IS_ENABLED(CONFIG_MPTCP)
		skb_ext_type_len[SKB_EXT_MPTCP] +
#endif
		0;
}
4283
4284static void skb_extensions_init(void)
4285{
4286	BUILD_BUG_ON(SKB_EXT_NUM >= 8);
4287	BUILD_BUG_ON(skb_ext_total_length() > 255);
4288
4289	skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
4290					     SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
4291					     0,
4292					     SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4293					     NULL);
4294}
4295#else
/* CONFIG_SKB_EXTENSIONS is off: there is no extension cache to create. */
static void skb_extensions_init(void) {}
4297#endif
4298
/* Boot-time creation of the sk_buff slab caches.
 *
 * Sets up the cache for plain struct sk_buff (only the cb[] area is declared
 * as the usercopy region), the cache for struct sk_buff_fclones, and finally
 * the skb extension cache.  Both caches are created with SLAB_PANIC, so no
 * failure handling is done here.
 */
void __init skb_init(void)
{
	skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      offsetof(struct sk_buff, cb),
					      sizeof_field(struct sk_buff, cb),
					      NULL);
	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						sizeof(struct sk_buff_fclones),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
	skb_extensions_init();
}
4315
/* Worker for skb_to_sgvec(): describe @len bytes of @skb starting at @offset
 * with entries of @sg.
 *
 * The linear area, the page fragments and then the members of the frag_list
 * are visited in that order; frag_list members are handled by recursion,
 * with @recursion_level counting the nesting depth.
 *
 * Returns the number of scatterlist entries filled in, or -EMSGSIZE when
 * the nesting is 24 levels deep or an entry marked as the last one in @sg
 * is reached while more entries are needed.  The end of the list is not
 * marked here.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
	       unsigned int recursion_level)
{
	/* start/end: offsets within skb of the piece under inspection
	 * copy:      bytes of that piece lying at or after @offset
	 * elt:       number of sg entries used so far
	 */
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int elt = 0;

	if (unlikely(recursion_level >= 24))
		return -EMSGSIZE;

	/* Linear area. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		sg_set_buf(sg, skb->data + offset, copy);
		elt++;
		if ((len -= copy) == 0)
			return elt;
		offset += copy;
	}

	/* Page fragments. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			/* The previous entry was the last one available. */
			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
				return -EMSGSIZE;

			if (copy > len)
				copy = len;
			sg_set_page(&sg[elt], skb_frag_page(frag), copy,
				    skb_frag_off(frag) + offset - start);
			elt++;
			if (!(len -= copy))
				return elt;
			offset += copy;
		}
		start = end;
	}

	/* Chained skbs on the frag_list. */
	skb_walk_frags(skb, frag_iter) {
		int end, ret;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
				return -EMSGSIZE;

			if (copy > len)
				copy = len;
			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
					      copy, recursion_level + 1);
			if (unlikely(ret < 0))
				return ret;
			elt += ret;
			if ((len -= copy) == 0)
				return elt;
			offset += copy;
		}
		start = end;
	}
	/* The requested range extends past the end of the skb. */
	BUG_ON(len);
	return elt;
}
4387
4388/**
4389 *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
4390 *	@skb: Socket buffer containing the buffers to be mapped
4391 *	@sg: The scatter-gather list to map into
4392 *	@offset: The offset into the buffer's contents to start mapping
4393 *	@len: Length of buffer space to be mapped
4394 *
4395 *	Fill the specified scatter-gather list with mappings/pointers into a
4396 *	region of the buffer space attached to a socket buffer. Returns either
4397 *	the number of scatterlist items used, or -EMSGSIZE if the contents
4398 *	could not fit.
4399 */
4400int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
4401{
4402	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
4403
4404	if (nsg <= 0)
4405		return nsg;
4406
4407	sg_mark_end(&sg[nsg - 1]);
4408
4409	return nsg;
4410}
4411EXPORT_SYMBOL_GPL(skb_to_sgvec);
4412
/* Unlike skb_to_sgvec(), skb_to_sgvec_nomark() only maps the skb into the
 * given scatterlist; it does not mark the entry holding the last skb data
 * as the end of the list. The caller can therefore keep appending new data
 * to the list after the first call without having to call sg_unmark_end()
 * to extend it.
 *
 * Scenario for using skb_to_sgvec_nomark():
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark()
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
			int offset, int len)
{
	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);

	return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
4438
4439
4440
4441/**
4442 *	skb_cow_data - Check that a socket buffer's data buffers are writable
4443 *	@skb: The socket buffer to check.
4444 *	@tailbits: Amount of trailing space to be added
4445 *	@trailer: Returned pointer to the skb where the @tailbits space begins
4446 *
4447 *	Make sure that the data buffers attached to a socket buffer are
4448 *	writable. If they are not, private copies are made of the data buffers
4449 *	and the socket buffer is set to use these instead.
4450 *
4451 *	If @tailbits is given, make sure that there is space to write @tailbits
4452 *	bytes of data beyond current end of socket buffer.  @trailer will be
4453 *	set to point to the skb in which this space begins.
4454 *
4455 *	The number of scatterlist elements required to completely map the
4456 *	COW'd and extended socket buffer will be returned.
4457 */
4458int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
4459{
4460	int copyflag;
4461	int elt;
4462	struct sk_buff *skb1, **skb_p;
4463
4464	/* If skb is cloned or its head is paged, reallocate
4465	 * head pulling out all the pages (pages are considered not writable
4466	 * at the moment even if they are anonymous).
4467	 */
4468	if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
4469	    !__pskb_pull_tail(skb, __skb_pagelen(skb)))
4470		return -ENOMEM;
4471
4472	/* Easy case. Most of packets will go this way. */
4473	if (!skb_has_frag_list(skb)) {
4474		/* A little of trouble, not enough of space for trailer.
4475		 * This should not happen, when stack is tuned to generate
4476		 * good frames. OK, on miss we reallocate and reserve even more
4477		 * space, 128 bytes is fair. */
4478
4479		if (skb_tailroom(skb) < tailbits &&
4480		    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
4481			return -ENOMEM;
4482
4483		/* Voila! */
4484		*trailer = skb;
4485		return 1;
4486	}
4487
4488	/* Misery. We are in troubles, going to mincer fragments... */
4489
4490	elt = 1;
4491	skb_p = &skb_shinfo(skb)->frag_list;
4492	copyflag = 0;
4493
4494	while ((skb1 = *skb_p) != NULL) {
4495		int ntail = 0;
4496
4497		/* The fragment is partially pulled by someone,
4498		 * this can happen on input. Copy it and everything
4499		 * after it. */
4500
4501		if (skb_shared(skb1))
4502			copyflag = 1;
4503
4504		/* If the skb is the last, worry about trailer. */
4505
4506		if (skb1->next == NULL && tailbits) {
4507			if (skb_shinfo(skb1)->nr_frags ||
4508			    skb_has_frag_list(skb1) ||
4509			    skb_tailroom(skb1) < tailbits)
4510				ntail = tailbits + 128;
4511		}
4512
4513		if (copyflag ||
4514		    skb_cloned(skb1) ||
4515		    ntail ||
4516		    skb_shinfo(skb1)->nr_frags ||
4517		    skb_has_frag_list(skb1)) {
4518			struct sk_buff *skb2;
4519
4520			/* Fuck, we are miserable poor guys... */
4521			if (ntail == 0)
4522				skb2 = skb_copy(skb1, GFP_ATOMIC);
4523			else
4524				skb2 = skb_copy_expand(skb1,
4525						       skb_headroom(skb1),
4526						       ntail,
4527						       GFP_ATOMIC);
4528			if (unlikely(skb2 == NULL))
4529				return -ENOMEM;
4530
4531			if (skb1->sk)
4532				skb_set_owner_w(skb2, skb1->sk);
4533
4534			/* Looking around. Are we still alive?
4535			 * OK, link new skb, drop old one */
4536
4537			skb2->next = skb1->next;
4538			*skb_p = skb2;
4539			kfree_skb(skb1);
4540			skb1 = skb2;
4541		}
4542		elt++;
4543		*trailer = skb1;
4544		skb_p = &skb1->next;
4545	}
4546
4547	return elt;
4548}
4549EXPORT_SYMBOL_GPL(skb_cow_data);
4550
4551static void sock_rmem_free(struct sk_buff *skb)
4552{
4553	struct sock *sk = skb->sk;
4554
4555	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
4556}
4557
4558static void skb_set_err_queue(struct sk_buff *skb)
4559{
4560	/* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
4561	 * So, it is safe to (mis)use it to mark skbs on the error queue.
4562	 */
4563	skb->pkt_type = PACKET_OUTGOING;
4564	BUILD_BUG_ON(PACKET_OUTGOING == 0);
4565}
4566
4567/*
4568 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
4569 */
4570int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
4571{
4572	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
4573	    (unsigned int)READ_ONCE(sk->sk_rcvbuf))
4574		return -ENOMEM;
4575
4576	skb_orphan(skb);
4577	skb->sk = sk;
4578	skb->destructor = sock_rmem_free;
4579	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
4580	skb_set_err_queue(skb);
4581
4582	/* before exiting rcu section, make sure dst is refcounted */
4583	skb_dst_force(skb);
4584
4585	skb_queue_tail(&sk->sk_error_queue, skb);
4586	if (!sock_flag(sk, SOCK_DEAD))
4587		sk->sk_error_report(sk);
4588	return 0;
4589}
4590EXPORT_SYMBOL(sock_queue_err_skb);
4591
4592static bool is_icmp_err_skb(const struct sk_buff *skb)
4593{
4594	return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
4595		       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
4596}
4597
4598struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
4599{
4600	struct sk_buff_head *q = &sk->sk_error_queue;
4601	struct sk_buff *skb, *skb_next = NULL;
4602	bool icmp_next = false;
4603	unsigned long flags;
4604
4605	spin_lock_irqsave(&q->lock, flags);
4606	skb = __skb_dequeue(q);
4607	if (skb && (skb_next = skb_peek(q))) {
4608		icmp_next = is_icmp_err_skb(skb_next);
4609		if (icmp_next)
4610			sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
4611	}
4612	spin_unlock_irqrestore(&q->lock, flags);
4613
4614	if (is_icmp_err_skb(skb) && !icmp_next)
4615		sk->sk_err = 0;
4616
4617	if (skb_next)
4618		sk->sk_error_report(sk);
4619
4620	return skb;
4621}
4622EXPORT_SYMBOL(sock_dequeue_err_skb);
4623
4624/**
4625 * skb_clone_sk - create clone of skb, and take reference to socket
4626 * @skb: the skb to clone
4627 *
4628 * This function creates a clone of a buffer that holds a reference on
4629 * sk_refcnt.  Buffers created via this function are meant to be
4630 * returned using sock_queue_err_skb, or free via kfree_skb.
4631 *
4632 * When passing buffers allocated with this function to sock_queue_err_skb
4633 * it is necessary to wrap the call with sock_hold/sock_put in order to
4634 * prevent the socket from being released prior to being enqueued on
4635 * the sk_error_queue.
4636 */
4637struct sk_buff *skb_clone_sk(struct sk_buff *skb)
4638{
4639	struct sock *sk = skb->sk;
4640	struct sk_buff *clone;
4641
4642	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
4643		return NULL;
4644
4645	clone = skb_clone(skb, GFP_ATOMIC);
4646	if (!clone) {
4647		sock_put(sk);
4648		return NULL;
4649	}
4650
4651	clone->sk = sk;
4652	clone->destructor = sock_efree;
4653
4654	return clone;
4655}
4656EXPORT_SYMBOL(skb_clone_sk);
4657
4658static void __skb_complete_tx_timestamp(struct sk_buff *skb,
4659					struct sock *sk,
4660					int tstype,
4661					bool opt_stats)
4662{
4663	struct sock_exterr_skb *serr;
4664	int err;
4665
4666	BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
4667
4668	serr = SKB_EXT_ERR(skb);
4669	memset(serr, 0, sizeof(*serr));
4670	serr->ee.ee_errno = ENOMSG;
4671	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
4672	serr->ee.ee_info = tstype;
4673	serr->opt_stats = opt_stats;
4674	serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
4675	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
4676		serr->ee.ee_data = skb_shinfo(skb)->tskey;
4677		if (sk->sk_protocol == IPPROTO_TCP &&
4678		    sk->sk_type == SOCK_STREAM)
4679			serr->ee.ee_data -= sk->sk_tskey;
4680	}
4681
4682	err = sock_queue_err_skb(sk, skb);
4683
4684	if (err)
4685		kfree_skb(skb);
4686}
4687
4688static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
4689{
4690	bool ret;
4691
4692	if (likely(sysctl_tstamp_allow_data || tsonly))
4693		return true;
4694
4695	read_lock_bh(&sk->sk_callback_lock);
4696	ret = sk->sk_socket && sk->sk_socket->file &&
4697	      file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
4698	read_unlock_bh(&sk->sk_callback_lock);
4699	return ret;
4700}
4701
4702void skb_complete_tx_timestamp(struct sk_buff *skb,
4703			       struct skb_shared_hwtstamps *hwtstamps)
4704{
4705	struct sock *sk = skb->sk;
4706
4707	if (!skb_may_tx_timestamp(sk, false))
4708		goto err;
4709
4710	/* Take a reference to prevent skb_orphan() from freeing the socket,
4711	 * but only if the socket refcount is not zero.
4712	 */
4713	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
4714		*skb_hwtstamps(skb) = *hwtstamps;
4715		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
4716		sock_put(sk);
4717		return;
4718	}
4719
4720err:
4721	kfree_skb(skb);
4722}
4723EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
4724
4725void __skb_tstamp_tx(struct sk_buff *orig_skb,
4726		     struct skb_shared_hwtstamps *hwtstamps,
4727		     struct sock *sk, int tstype)
4728{
4729	struct sk_buff *skb;
4730	bool tsonly, opt_stats = false;
4731
4732	if (!sk)
4733		return;
4734
4735	if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
4736	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
4737		return;
4738
4739	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
4740	if (!skb_may_tx_timestamp(sk, tsonly))
4741		return;
4742
4743	if (tsonly) {
4744#ifdef CONFIG_INET
4745		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
4746		    sk->sk_protocol == IPPROTO_TCP &&
4747		    sk->sk_type == SOCK_STREAM) {
4748			skb = tcp_get_timestamping_opt_stats(sk, orig_skb);
4749			opt_stats = true;
4750		} else
4751#endif
4752			skb = alloc_skb(0, GFP_ATOMIC);
4753	} else {
4754		skb = skb_clone(orig_skb, GFP_ATOMIC);
4755	}
4756	if (!skb)
4757		return;
4758
4759	if (tsonly) {
4760		skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
4761					     SKBTX_ANY_TSTAMP;
4762		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
4763	}
4764
4765	if (hwtstamps)
4766		*skb_hwtstamps(skb) = *hwtstamps;
4767	else
4768		skb->tstamp = ktime_get_real();
4769
4770	__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
4771}
4772EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
4773
4774void skb_tstamp_tx(struct sk_buff *orig_skb,
4775		   struct skb_shared_hwtstamps *hwtstamps)
4776{
4777	return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
4778			       SCM_TSTAMP_SND);
4779}
4780EXPORT_SYMBOL_GPL(skb_tstamp_tx);
4781
4782void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
4783{
4784	struct sock *sk = skb->sk;
4785	struct sock_exterr_skb *serr;
4786	int err = 1;
4787
4788	skb->wifi_acked_valid = 1;
4789	skb->wifi_acked = acked;
4790
4791	serr = SKB_EXT_ERR(skb);
4792	memset(serr, 0, sizeof(*serr));
4793	serr->ee.ee_errno = ENOMSG;
4794	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
4795
4796	/* Take a reference to prevent skb_orphan() from freeing the socket,
4797	 * but only if the socket refcount is not zero.
4798	 */
4799	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
4800		err = sock_queue_err_skb(sk, skb);
4801		sock_put(sk);
4802	}
4803	if (err)
4804		kfree_skb(skb);
4805}
4806EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
4807
4808/**
4809 * skb_partial_csum_set - set up and verify partial csum values for packet
4810 * @skb: the skb to set
4811 * @start: the number of bytes after skb->data to start checksumming.
4812 * @off: the offset from start to place the checksum.
4813 *
4814 * For untrusted partially-checksummed packets, we need to make sure the values
4815 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
4816 *
4817 * This function checks and sets those values and skb->ip_summed: if this
4818 * returns false you should drop the packet.
4819 */
4820bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
4821{
4822	u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
4823	u32 csum_start = skb_headroom(skb) + (u32)start;
4824
4825	if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
4826		net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
4827				     start, off, skb_headroom(skb), skb_headlen(skb));
4828		return false;
4829	}
4830	skb->ip_summed = CHECKSUM_PARTIAL;
4831	skb->csum_start = csum_start;
4832	skb->csum_offset = off;
4833	skb_set_transport_header(skb, start);
4834	return true;
4835}
4836EXPORT_SYMBOL_GPL(skb_partial_csum_set);
4837
4838static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
4839			       unsigned int max)
4840{
4841	if (skb_headlen(skb) >= len)
4842		return 0;
4843
4844	/* If we need to pullup then pullup to the max, so we
4845	 * won't need to do it again.
4846	 */
4847	if (max > skb->len)
4848		max = skb->len;
4849
4850	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
4851		return -ENOMEM;
4852
4853	if (skb_headlen(skb) < len)
4854		return -EPROTO;
4855
4856	return 0;
4857}
4858
4859#define MAX_TCP_HDR_LEN (15 * 4)
4860
4861static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
4862				      typeof(IPPROTO_IP) proto,
4863				      unsigned int off)
4864{
4865	int err;
4866
4867	switch (proto) {
4868	case IPPROTO_TCP:
4869		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
4870					  off + MAX_TCP_HDR_LEN);
4871		if (!err && !skb_partial_csum_set(skb, off,
4872						  offsetof(struct tcphdr,
4873							   check)))
4874			err = -EPROTO;
4875		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
4876
4877	case IPPROTO_UDP:
4878		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
4879					  off + sizeof(struct udphdr));
4880		if (!err && !skb_partial_csum_set(skb, off,
4881						  offsetof(struct udphdr,
4882							   check)))
4883			err = -EPROTO;
4884		return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
4885	}
4886
4887	return ERR_PTR(-EPROTO);
4888}
4889
4890/* This value should be large enough to cover a tagged ethernet header plus
4891 * maximally sized IP and TCP or UDP headers.
4892 */
4893#define MAX_IP_HDR_LEN 128
4894
4895static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
4896{
4897	unsigned int off;
4898	bool fragment;
4899	__sum16 *csum;
4900	int err;
4901
4902	fragment = false;
4903
4904	err = skb_maybe_pull_tail(skb,
4905				  sizeof(struct iphdr),
4906				  MAX_IP_HDR_LEN);
4907	if (err < 0)
4908		goto out;
4909
4910	if (ip_is_fragment(ip_hdr(skb)))
4911		fragment = true;
4912
4913	off = ip_hdrlen(skb);
4914
4915	err = -EPROTO;
4916
4917	if (fragment)
4918		goto out;
4919
4920	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
4921	if (IS_ERR(csum))
4922		return PTR_ERR(csum);
4923
4924	if (recalculate)
4925		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
4926					   ip_hdr(skb)->daddr,
4927					   skb->len - off,
4928					   ip_hdr(skb)->protocol, 0);
4929	err = 0;
4930
4931out:
4932	return err;
4933}
4934
4935/* This value should be large enough to cover a tagged ethernet header plus
4936 * an IPv6 header, all options, and a maximal TCP or UDP header.
4937 */
4938#define MAX_IPV6_HDR_LEN 256
4939
4940#define OPT_HDR(type, skb, off) \
4941	(type *)(skb_network_header(skb) + (off))
4942
4943static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
4944{
4945	int err;
4946	u8 nexthdr;
4947	unsigned int off;
4948	unsigned int len;
4949	bool fragment;
4950	bool done;
4951	__sum16 *csum;
4952
4953	fragment = false;
4954	done = false;
4955
4956	off = sizeof(struct ipv6hdr);
4957
4958	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
4959	if (err < 0)
4960		goto out;
4961
4962	nexthdr = ipv6_hdr(skb)->nexthdr;
4963
4964	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
4965	while (off <= len && !done) {
4966		switch (nexthdr) {
4967		case IPPROTO_DSTOPTS:
4968		case IPPROTO_HOPOPTS:
4969		case IPPROTO_ROUTING: {
4970			struct ipv6_opt_hdr *hp;
4971
4972			err = skb_maybe_pull_tail(skb,
4973						  off +
4974						  sizeof(struct ipv6_opt_hdr),
4975						  MAX_IPV6_HDR_LEN);
4976			if (err < 0)
4977				goto out;
4978
4979			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
4980			nexthdr = hp->nexthdr;
4981			off += ipv6_optlen(hp);
4982			break;
4983		}
4984		case IPPROTO_AH: {
4985			struct ip_auth_hdr *hp;
4986
4987			err = skb_maybe_pull_tail(skb,
4988						  off +
4989						  sizeof(struct ip_auth_hdr),
4990						  MAX_IPV6_HDR_LEN);
4991			if (err < 0)
4992				goto out;
4993
4994			hp = OPT_HDR(struct ip_auth_hdr, skb, off);
4995			nexthdr = hp->nexthdr;
4996			off += ipv6_authlen(hp);
4997			break;
4998		}
4999		case IPPROTO_FRAGMENT: {
5000			struct frag_hdr *hp;
5001
5002			err = skb_maybe_pull_tail(skb,
5003						  off +
5004						  sizeof(struct frag_hdr),
5005						  MAX_IPV6_HDR_LEN);
5006			if (err < 0)
5007				goto out;
5008
5009			hp = OPT_HDR(struct frag_hdr, skb, off);
5010
5011			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
5012				fragment = true;
5013
5014			nexthdr = hp->nexthdr;
5015			off += sizeof(struct frag_hdr);
5016			break;
5017		}
5018		default:
5019			done = true;
5020			break;
5021		}
5022	}
5023
5024	err = -EPROTO;
5025
5026	if (!done || fragment)
5027		goto out;
5028
5029	csum = skb_checksum_setup_ip(skb, nexthdr, off);
5030	if (IS_ERR(csum))
5031		return PTR_ERR(csum);
5032
5033	if (recalculate)
5034		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
5035					 &ipv6_hdr(skb)->daddr,
5036					 skb->len - off, nexthdr, 0);
5037	err = 0;
5038
5039out:
5040	return err;
5041}
5042
5043/**
5044 * skb_checksum_setup - set up partial checksum offset
5045 * @skb: the skb to set up
5046 * @recalculate: if true the pseudo-header checksum will be recalculated
5047 */
5048int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
5049{
5050	int err;
5051
5052	switch (skb->protocol) {
5053	case htons(ETH_P_IP):
5054		err = skb_checksum_setup_ipv4(skb, recalculate);
5055		break;
5056
5057	case htons(ETH_P_IPV6):
5058		err = skb_checksum_setup_ipv6(skb, recalculate);
5059		break;
5060
5061	default:
5062		err = -EPROTO;
5063		break;
5064	}
5065
5066	return err;
5067}
5068EXPORT_SYMBOL(skb_checksum_setup);
5069
5070/**
5071 * skb_checksum_maybe_trim - maybe trims the given skb
5072 * @skb: the skb to check
5073 * @transport_len: the data length beyond the network header
5074 *
5075 * Checks whether the given skb has data beyond the given transport length.
5076 * If so, returns a cloned skb trimmed to this transport length.
5077 * Otherwise returns the provided skb. Returns NULL in error cases
5078 * (e.g. transport_len exceeds skb length or out-of-memory).
5079 *
5080 * Caller needs to set the skb transport header and free any returned skb if it
5081 * differs from the provided skb.
5082 */
5083static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
5084					       unsigned int transport_len)
5085{
5086	struct sk_buff *skb_chk;
5087	unsigned int len = skb_transport_offset(skb) + transport_len;
5088	int ret;
5089
5090	if (skb->len < len)
5091		return NULL;
5092	else if (skb->len == len)
5093		return skb;
5094
5095	skb_chk = skb_clone(skb, GFP_ATOMIC);
5096	if (!skb_chk)
5097		return NULL;
5098
5099	ret = pskb_trim_rcsum(skb_chk, len);
5100	if (ret) {
5101		kfree_skb(skb_chk);
5102		return NULL;
5103	}
5104
5105	return skb_chk;
5106}
5107
5108/**
5109 * skb_checksum_trimmed - validate checksum of an skb
5110 * @skb: the skb to check
5111 * @transport_len: the data length beyond the network header
5112 * @skb_chkf: checksum function to use
5113 *
5114 * Applies the given checksum function skb_chkf to the provided skb.
5115 * Returns a checked and maybe trimmed skb. Returns NULL on error.
5116 *
5117 * If the skb has data beyond the given transport length, then a
5118 * trimmed & cloned skb is checked and returned.
5119 *
5120 * Caller needs to set the skb transport header and free any returned skb if it
5121 * differs from the provided skb.
5122 */
5123struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
5124				     unsigned int transport_len,
5125				     __sum16(*skb_chkf)(struct sk_buff *skb))
5126{
5127	struct sk_buff *skb_chk;
5128	unsigned int offset = skb_transport_offset(skb);
5129	__sum16 ret;
5130
5131	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
5132	if (!skb_chk)
5133		goto err;
5134
5135	if (!pskb_may_pull(skb_chk, offset))
5136		goto err;
5137
5138	skb_pull_rcsum(skb_chk, offset);
5139	ret = skb_chkf(skb_chk);
5140	skb_push_rcsum(skb_chk, offset);
5141
5142	if (ret)
5143		goto err;
5144
5145	return skb_chk;
5146
5147err:
5148	if (skb_chk && skb_chk != skb)
5149		kfree_skb(skb_chk);
5150
5151	return NULL;
5152
5153}
5154EXPORT_SYMBOL(skb_checksum_trimmed);
5155
5156void __skb_warn_lro_forwarding(const struct sk_buff *skb)
5157{
5158	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
5159			     skb->dev->name);
5160}
5161EXPORT_SYMBOL(__skb_warn_lro_forwarding);
5162
5163void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
5164{
5165	if (head_stolen) {
5166		skb_release_head_state(skb);
5167		kmem_cache_free(skbuff_head_cache, skb);
5168	} else {
5169		__kfree_skb(skb);
5170	}
5171}
5172EXPORT_SYMBOL(kfree_skb_partial);
5173
5174/**
5175 * skb_try_coalesce - try to merge skb to prior one
5176 * @to: prior buffer
5177 * @from: buffer to add
5178 * @fragstolen: pointer to boolean
5179 * @delta_truesize: how much more was allocated than was requested
5180 */
5181bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
5182		      bool *fragstolen, int *delta_truesize)
5183{
5184	struct skb_shared_info *to_shinfo, *from_shinfo;
5185	int i, delta, len = from->len;
5186
5187	*fragstolen = false;
5188
5189	if (skb_cloned(to))
5190		return false;
5191
5192	if (len <= skb_tailroom(to)) {
5193		if (len)
5194			BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
5195		*delta_truesize = 0;
5196		return true;
5197	}
5198
5199	to_shinfo = skb_shinfo(to);
5200	from_shinfo = skb_shinfo(from);
5201	if (to_shinfo->frag_list || from_shinfo->frag_list)
5202		return false;
5203	if (skb_zcopy(to) || skb_zcopy(from))
5204		return false;
5205
5206	if (skb_headlen(from) != 0) {
5207		struct page *page;
5208		unsigned int offset;
5209
5210		if (to_shinfo->nr_frags +
5211		    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
5212			return false;
5213
5214		if (skb_head_is_locked(from))
5215			return false;
5216
5217		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
5218
5219		page = virt_to_head_page(from->head);
5220		offset = from->data - (unsigned char *)page_address(page);
5221
5222		skb_fill_page_desc(to, to_shinfo->nr_frags,
5223				   page, offset, skb_headlen(from));
5224		*fragstolen = true;
5225	} else {
5226		if (to_shinfo->nr_frags +
5227		    from_shinfo->nr_frags > MAX_SKB_FRAGS)
5228			return false;
5229
5230		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
5231	}
5232
5233	WARN_ON_ONCE(delta < len);
5234
5235	memcpy(to_shinfo->frags + to_shinfo->nr_frags,
5236	       from_shinfo->frags,
5237	       from_shinfo->nr_frags * sizeof(skb_frag_t));
5238	to_shinfo->nr_frags += from_shinfo->nr_frags;
5239
5240	if (!skb_cloned(from))
5241		from_shinfo->nr_frags = 0;
5242
5243	/* if the skb is not cloned this does nothing
5244	 * since we set nr_frags to 0.
5245	 */
5246	for (i = 0; i < from_shinfo->nr_frags; i++)
5247		__skb_frag_ref(&from_shinfo->frags[i]);
5248
5249	to->truesize += delta;
5250	to->len += len;
5251	to->data_len += len;
5252
5253	*delta_truesize = delta;
5254	return true;
5255}
5256EXPORT_SYMBOL(skb_try_coalesce);
5257
5258/**
5259 * skb_scrub_packet - scrub an skb
5260 *
5261 * @skb: buffer to clean
5262 * @xnet: packet is crossing netns
5263 *
 * skb_scrub_packet can be used after encapsulating or decapsulating a packet
 * into/from a tunnel. Some information has to be cleared during these
 * operations.
5267 * skb_scrub_packet can also be used to clean a skb before injecting it in
5268 * another namespace (@xnet == true). We have to clear all information in the
5269 * skb that could impact namespace isolation.
5270 */
5271void skb_scrub_packet(struct sk_buff *skb, bool xnet)
5272{
5273	skb->pkt_type = PACKET_HOST;
5274	skb->skb_iif = 0;
5275	skb->ignore_df = 0;
5276	skb_dst_drop(skb);
5277	skb_ext_reset(skb);
5278	nf_reset_ct(skb);
5279	nf_reset_trace(skb);
5280
5281#ifdef CONFIG_NET_SWITCHDEV
5282	skb->offload_fwd_mark = 0;
5283	skb->offload_l3_fwd_mark = 0;
5284#endif
5285
5286	if (!xnet)
5287		return;
5288
5289	ipvs_reset(skb);
5290	skb->mark = 0;
5291	skb->tstamp = 0;
5292}
5293EXPORT_SYMBOL_GPL(skb_scrub_packet);
5294
5295/**
5296 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
5297 *
5298 * @skb: GSO skb
5299 *
5300 * skb_gso_transport_seglen is used to determine the real size of the
5301 * individual segments, including Layer4 headers (TCP/UDP).
5302 *
5303 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
5304 */
5305static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
5306{
5307	const struct skb_shared_info *shinfo = skb_shinfo(skb);
5308	unsigned int thlen = 0;
5309
5310	if (skb->encapsulation) {
5311		thlen = skb_inner_transport_header(skb) -
5312			skb_transport_header(skb);
5313
5314		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
5315			thlen += inner_tcp_hdrlen(skb);
5316	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
5317		thlen = tcp_hdrlen(skb);
5318	} else if (unlikely(skb_is_gso_sctp(skb))) {
5319		thlen = sizeof(struct sctphdr);
5320	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
5321		thlen = sizeof(struct udphdr);
5322	}
5323	/* UFO sets gso_size to the size of the fragmentation
5324	 * payload, i.e. the size of the L4 (UDP) header is already
5325	 * accounted for.
5326	 */
5327	return thlen + shinfo->gso_size;
5328}
5329
5330/**
5331 * skb_gso_network_seglen - Return length of individual segments of a gso packet
5332 *
5333 * @skb: GSO skb
5334 *
5335 * skb_gso_network_seglen is used to determine the real size of the
5336 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
5337 *
5338 * The MAC/L2 header is not accounted for.
5339 */
static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
{
	/* Length of the network header(s) in front of the transport header. */
	unsigned int l3_len = skb_transport_header(skb) -
			      skb_network_header(skb);
	unsigned int seglen = skb_gso_transport_seglen(skb);

	return l3_len + seglen;
}
5347
5348/**
5349 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
5350 *
5351 * @skb: GSO skb
5352 *
5353 * skb_gso_mac_seglen is used to determine the real size of the
5354 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
5355 * headers (TCP/UDP).
5356 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	/* Everything from the MAC header up to the transport header. */
	unsigned int l2l3_len = skb_transport_header(skb) -
				skb_mac_header(skb);
	unsigned int seglen = skb_gso_transport_seglen(skb);

	return l2l3_len + seglen;
}
5363
5364/**
5365 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
5366 *
5367 * There are a couple of instances where we have a GSO skb, and we
5368 * want to determine what size it would be after it is segmented.
5369 *
5370 * We might want to check:
5371 * -    L3+L4+payload size (e.g. IP forwarding)
5372 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
5373 *
5374 * This is a helper to do that correctly considering GSO_BY_FRAGS.
5375 *
5376 * @skb: GSO skb
5377 *
5378 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
5379 *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
5380 *
5381 * @max_len: The maximum permissible length.
5382 *
5383 * Returns true if the segmented length <= max length.
5384 */
5385static inline bool skb_gso_size_check(const struct sk_buff *skb,
5386				      unsigned int seg_len,
5387				      unsigned int max_len) {
5388	const struct skb_shared_info *shinfo = skb_shinfo(skb);
5389	const struct sk_buff *iter;
5390
5391	if (shinfo->gso_size != GSO_BY_FRAGS)
5392		return seg_len <= max_len;
5393
5394	/* Undo this so we can re-use header sizes */
5395	seg_len -= GSO_BY_FRAGS;
5396
5397	skb_walk_frags(skb, iter) {
5398		if (seg_len + skb_headlen(iter) > max_len)
5399			return false;
5400	}
5401
5402	return true;
5403}
5404
5405/**
5406 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
5407 *
5408 * @skb: GSO skb
5409 * @mtu: MTU to validate against
5410 *
5411 * skb_gso_validate_network_len validates if a given skb will fit a
5412 * wanted MTU once split. It considers L3 headers, L4 headers, and the
5413 * payload.
5414 */
5415bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
5416{
5417	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
5418}
5419EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
5420
5421/**
5422 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
5423 *
5424 * @skb: GSO skb
5425 * @len: length to validate against
5426 *
5427 * skb_gso_validate_mac_len validates if a given skb will fit a wanted
5428 * length once split, including L2, L3 and L4 headers and the payload.
5429 */
5430bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
5431{
5432	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
5433}
5434EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
5435
5436static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
5437{
5438	int mac_len, meta_len;
5439	void *meta;
5440
5441	if (skb_cow(skb, skb_headroom(skb)) < 0) {
5442		kfree_skb(skb);
5443		return NULL;
5444	}
5445
5446	mac_len = skb->data - skb_mac_header(skb);
5447	if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
5448		memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
5449			mac_len - VLAN_HLEN - ETH_TLEN);
5450	}
5451
5452	meta_len = skb_metadata_len(skb);
5453	if (meta_len) {
5454		meta = skb_metadata_end(skb) - meta_len;
5455		memmove(meta + VLAN_HLEN, meta, meta_len);
5456	}
5457
5458	skb->mac_header += VLAN_HLEN;
5459	return skb;
5460}
5461
5462struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
5463{
5464	struct vlan_hdr *vhdr;
5465	u16 vlan_tci;
5466
5467	if (unlikely(skb_vlan_tag_present(skb))) {
5468		/* vlan_tci is already set-up so leave this for another time */
5469		return skb;
5470	}
5471
5472	skb = skb_share_check(skb, GFP_ATOMIC);
5473	if (unlikely(!skb))
5474		goto err_free;
5475	/* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
5476	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
5477		goto err_free;
5478
5479	vhdr = (struct vlan_hdr *)skb->data;
5480	vlan_tci = ntohs(vhdr->h_vlan_TCI);
5481	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
5482
5483	skb_pull_rcsum(skb, VLAN_HLEN);
5484	vlan_set_encap_proto(skb, vhdr);
5485
5486	skb = skb_reorder_vlan_header(skb);
5487	if (unlikely(!skb))
5488		goto err_free;
5489
5490	skb_reset_network_header(skb);
5491	if (!skb_transport_header_was_set(skb))
5492		skb_reset_transport_header(skb);
5493	skb_reset_mac_len(skb);
5494
5495	return skb;
5496
5497err_free:
5498	kfree_skb(skb);
5499	return NULL;
5500}
5501EXPORT_SYMBOL(skb_vlan_untag);
5502
5503int skb_ensure_writable(struct sk_buff *skb, int write_len)
5504{
5505	if (!pskb_may_pull(skb, write_len))
5506		return -ENOMEM;
5507
5508	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
5509		return 0;
5510
5511	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
5512}
5513EXPORT_SYMBOL(skb_ensure_writable);
5514
5515/* remove VLAN header from packet and update csum accordingly.
5516 * expects a non skb_vlan_tag_present skb with a vlan tag payload
5517 */
5518int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
5519{
5520	struct vlan_hdr *vhdr;
5521	int offset = skb->data - skb_mac_header(skb);
5522	int err;
5523
5524	if (WARN_ONCE(offset,
5525		      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
5526		      offset)) {
5527		return -EINVAL;
5528	}
5529
5530	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
5531	if (unlikely(err))
5532		return err;
5533
5534	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
5535
5536	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
5537	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
5538
5539	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
5540	__skb_pull(skb, VLAN_HLEN);
5541
5542	vlan_set_encap_proto(skb, vhdr);
5543	skb->mac_header += VLAN_HLEN;
5544
5545	if (skb_network_offset(skb) < ETH_HLEN)
5546		skb_set_network_header(skb, ETH_HLEN);
5547
5548	skb_reset_mac_len(skb);
5549
5550	return err;
5551}
5552EXPORT_SYMBOL(__skb_vlan_pop);
5553
5554/* Pop a vlan tag either from hwaccel or from payload.
5555 * Expects skb->data at mac header.
5556 */
5557int skb_vlan_pop(struct sk_buff *skb)
5558{
5559	u16 vlan_tci;
5560	__be16 vlan_proto;
5561	int err;
5562
5563	if (likely(skb_vlan_tag_present(skb))) {
5564		__vlan_hwaccel_clear_tag(skb);
5565	} else {
5566		if (unlikely(!eth_type_vlan(skb->protocol)))
5567			return 0;
5568
5569		err = __skb_vlan_pop(skb, &vlan_tci);
5570		if (err)
5571			return err;
5572	}
5573	/* move next vlan tag to hw accel tag */
5574	if (likely(!eth_type_vlan(skb->protocol)))
5575		return 0;
5576
5577	vlan_proto = skb->protocol;
5578	err = __skb_vlan_pop(skb, &vlan_tci);
5579	if (unlikely(err))
5580		return err;
5581
5582	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5583	return 0;
5584}
5585EXPORT_SYMBOL(skb_vlan_pop);
5586
5587/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
5588 * Expects skb->data at mac header.
5589 */
5590int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
5591{
5592	if (skb_vlan_tag_present(skb)) {
5593		int offset = skb->data - skb_mac_header(skb);
5594		int err;
5595
5596		if (WARN_ONCE(offset,
5597			      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
5598			      offset)) {
5599			return -EINVAL;
5600		}
5601
5602		err = __vlan_insert_tag(skb, skb->vlan_proto,
5603					skb_vlan_tag_get(skb));
5604		if (err)
5605			return err;
5606
5607		skb->protocol = skb->vlan_proto;
5608		skb->mac_len += VLAN_HLEN;
5609
5610		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
5611	}
5612	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5613	return 0;
5614}
5615EXPORT_SYMBOL(skb_vlan_push);
5616
5617/**
5618 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
5619 *
5620 * @skb: Socket buffer to modify
5621 *
5622 * Drop the Ethernet header of @skb.
5623 *
5624 * Expects that skb->data points to the mac header and that no VLAN tags are
5625 * present.
5626 *
5627 * Returns 0 on success, -errno otherwise.
5628 */
5629int skb_eth_pop(struct sk_buff *skb)
5630{
5631	if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
5632	    skb_network_offset(skb) < ETH_HLEN)
5633		return -EPROTO;
5634
5635	skb_pull_rcsum(skb, ETH_HLEN);
5636	skb_reset_mac_header(skb);
5637	skb_reset_mac_len(skb);
5638
5639	return 0;
5640}
5641EXPORT_SYMBOL(skb_eth_pop);
5642
5643/**
5644 * skb_eth_push() - Add a new Ethernet header at the head of a packet
5645 *
5646 * @skb: Socket buffer to modify
5647 * @dst: Destination MAC address of the new header
5648 * @src: Source MAC address of the new header
5649 *
5650 * Prepend @skb with a new Ethernet header.
5651 *
5652 * Expects that skb->data points to the mac header, which must be empty.
5653 *
5654 * Returns 0 on success, -errno otherwise.
5655 */
5656int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
5657		 const unsigned char *src)
5658{
5659	struct ethhdr *eth;
5660	int err;
5661
5662	if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
5663		return -EPROTO;
5664
5665	err = skb_cow_head(skb, sizeof(*eth));
5666	if (err < 0)
5667		return err;
5668
5669	skb_push(skb, sizeof(*eth));
5670	skb_reset_mac_header(skb);
5671	skb_reset_mac_len(skb);
5672
5673	eth = eth_hdr(skb);
5674	ether_addr_copy(eth->h_dest, dst);
5675	ether_addr_copy(eth->h_source, src);
5676	eth->h_proto = skb->protocol;
5677
5678	skb_postpush_rcsum(skb, eth, sizeof(*eth));
5679
5680	return 0;
5681}
5682EXPORT_SYMBOL(skb_eth_push);
5683
5684/* Update the ethertype of hdr and the skb csum value if required. */
5685static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
5686			     __be16 ethertype)
5687{
5688	if (skb->ip_summed == CHECKSUM_COMPLETE) {
5689		__be16 diff[] = { ~hdr->h_proto, ethertype };
5690
5691		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5692	}
5693
5694	hdr->h_proto = ethertype;
5695}
5696
5697/**
5698 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
5699 *                   the packet
5700 *
5701 * @skb: buffer
5702 * @mpls_lse: MPLS label stack entry to push
5703 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
5704 * @mac_len: length of the MAC header
5705 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
5706 *            ethernet
5707 *
5708 * Expects skb->data at mac header.
5709 *
5710 * Returns 0 on success, -errno otherwise.
5711 */
5712int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
5713		  int mac_len, bool ethernet)
5714{
5715	struct mpls_shim_hdr *lse;
5716	int err;
5717
5718	if (unlikely(!eth_p_mpls(mpls_proto)))
5719		return -EINVAL;
5720
5721	/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
5722	if (skb->encapsulation)
5723		return -EINVAL;
5724
5725	err = skb_cow_head(skb, MPLS_HLEN);
5726	if (unlikely(err))
5727		return err;
5728
5729	if (!skb->inner_protocol) {
5730		skb_set_inner_network_header(skb, skb_network_offset(skb));
5731		skb_set_inner_protocol(skb, skb->protocol);
5732	}
5733
5734	skb_push(skb, MPLS_HLEN);
5735	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
5736		mac_len);
5737	skb_reset_mac_header(skb);
5738	skb_set_network_header(skb, mac_len);
5739	skb_reset_mac_len(skb);
5740
5741	lse = mpls_hdr(skb);
5742	lse->label_stack_entry = mpls_lse;
5743	skb_postpush_rcsum(skb, lse, MPLS_HLEN);
5744
5745	if (ethernet && mac_len >= ETH_HLEN)
5746		skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
5747	skb->protocol = mpls_proto;
5748
5749	return 0;
5750}
5751EXPORT_SYMBOL_GPL(skb_mpls_push);
5752
5753/**
5754 * skb_mpls_pop() - pop the outermost MPLS header
5755 *
5756 * @skb: buffer
5757 * @next_proto: ethertype of header after popped MPLS header
5758 * @mac_len: length of the MAC header
5759 * @ethernet: flag to indicate if the packet is ethernet
5760 *
5761 * Expects skb->data at mac header.
5762 *
5763 * Returns 0 on success, -errno otherwise.
5764 */
5765int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
5766		 bool ethernet)
5767{
5768	int err;
5769
5770	if (unlikely(!eth_p_mpls(skb->protocol)))
5771		return 0;
5772
5773	err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
5774	if (unlikely(err))
5775		return err;
5776
5777	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
5778	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
5779		mac_len);
5780
5781	__skb_pull(skb, MPLS_HLEN);
5782	skb_reset_mac_header(skb);
5783	skb_set_network_header(skb, mac_len);
5784
5785	if (ethernet && mac_len >= ETH_HLEN) {
5786		struct ethhdr *hdr;
5787
5788		/* use mpls_hdr() to get ethertype to account for VLANs. */
5789		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
5790		skb_mod_eth_type(skb, hdr, next_proto);
5791	}
5792	skb->protocol = next_proto;
5793
5794	return 0;
5795}
5796EXPORT_SYMBOL_GPL(skb_mpls_pop);
5797
5798/**
5799 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
5800 *
5801 * @skb: buffer
5802 * @mpls_lse: new MPLS label stack entry to update to
5803 *
5804 * Expects skb->data at mac header.
5805 *
5806 * Returns 0 on success, -errno otherwise.
5807 */
5808int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
5809{
5810	int err;
5811
5812	if (unlikely(!eth_p_mpls(skb->protocol)))
5813		return -EINVAL;
5814
5815	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
5816	if (unlikely(err))
5817		return err;
5818
5819	if (skb->ip_summed == CHECKSUM_COMPLETE) {
5820		__be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
5821
5822		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5823	}
5824
5825	mpls_hdr(skb)->label_stack_entry = mpls_lse;
5826
5827	return 0;
5828}
5829EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
5830
5831/**
5832 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
5833 *
5834 * @skb: buffer
5835 *
5836 * Expects skb->data at mac header.
5837 *
5838 * Returns 0 on success, -errno otherwise.
5839 */
5840int skb_mpls_dec_ttl(struct sk_buff *skb)
5841{
5842	u32 lse;
5843	u8 ttl;
5844
5845	if (unlikely(!eth_p_mpls(skb->protocol)))
5846		return -EINVAL;
5847
5848	if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
5849		return -ENOMEM;
5850
5851	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
5852	ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
5853	if (!--ttl)
5854		return -EINVAL;
5855
5856	lse &= ~MPLS_LS_TTL_MASK;
5857	lse |= ttl << MPLS_LS_TTL_SHIFT;
5858
5859	return skb_mpls_update_lse(skb, cpu_to_be32(lse));
5860}
5861EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
5862
/**
 * alloc_skb_with_frags - allocate skb with page frags
 *
 * @header_len: size of linear part
 * @data_len: needed length in frags
 * @max_page_order: max page order desired.
 * @errcode: pointer to error code if any
 * @gfp_mask: allocation mask
 *
 * This can be used to allocate a paged skb, given a maximal order for frags.
 *
 * Returns the new skb, or NULL on failure. On failure *@errcode is
 * -EMSGSIZE when @data_len needs more than MAX_SKB_FRAGS pages, and
 * -ENOBUFS when the skb or one of its pages cannot be allocated.
 * *@errcode is written unconditionally and is not reset on success, so it
 * is only meaningful when NULL is returned.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
				     unsigned long data_len,
				     int max_page_order,
				     int *errcode,
				     gfp_t gfp_mask)
{
	/* Number of pages needed to hold data_len bytes, rounded up. */
	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
	unsigned long chunk;
	struct sk_buff *skb;
	struct page *page;
	int i;

	*errcode = -EMSGSIZE;
	/* Note this test could be relaxed, if we succeed to allocate
	 * high order pages...
	 */
	if (npages > MAX_SKB_FRAGS)
		return NULL;

	*errcode = -ENOBUFS;
	skb = alloc_skb(header_len, gfp_mask);
	if (!skb)
		return NULL;

	/* Charge all frag pages to truesize up front. */
	skb->truesize += npages << PAGE_SHIFT;

	/* One frag slot (index i) per iteration; npages counts what is left. */
	for (i = 0; npages > 0; i++) {
		int order = max_page_order;

		/* Try the largest order whose page count still fits in the
		 * remaining npages. High-order attempts run without direct
		 * reclaim and without allocation-failure warnings.
		 */
		while (order) {
			if (npages >= 1 << order) {
				page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
						   __GFP_COMP |
						   __GFP_NOWARN,
						   order);
				if (page)
					goto fill_page;
				/* Do not retry other high order allocations */
				order = 1;
				max_page_order = 0;
			}
			order--;
		}
		/* order is 0 here: fall back to a single page, this time
		 * with the caller's unmodified gfp_mask.
		 */
		page = alloc_page(gfp_mask);
		if (!page)
			goto failure;
fill_page:
		/* The last frag may need less than the pages just allocated. */
		chunk = min_t(unsigned long, data_len,
			      PAGE_SIZE << order);
		skb_fill_page_desc(skb, i, page, 0, chunk);
		data_len -= chunk;
		npages -= 1 << order;
	}
	return skb;

failure:
	kfree_skb(skb);
	return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);
5934
/* carve out the first off bytes from skb when off < headlen
 *
 * A new head is allocated, the remaining headlen - off linear bytes are
 * copied to its start, and the shared info (including all frags) is copied
 * behind it. The skb is then switched over to the new head.
 *
 * Returns 0 on success, -ENOMEM if the new head cannot be allocated or
 * skb_orphan_frags() fails.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
				    const int headlen, gfp_t gfp_mask)
{
	int i;
	int size = skb_end_offset(skb);
	int new_hlen = headlen - off;
	u8 *data;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	/* Use whatever the allocator really handed out. */
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy real data, and all frags */
	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
	/* NOTE(review): skb->len is reduced before the skb_orphan_frags()
	 * failure return below, so that error path leaves skb->len already
	 * shortened by off - confirm callers always discard the skb then.
	 */
	skb->len -= off;

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info,
			frags[skb_shinfo(skb)->nr_frags]));
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree(data);
			return -ENOMEM;
		}
		/* The copied shared info needs references of its own on the
		 * frags and the frag list before the old data is released.
		 */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb);
	} else {
		/* we can reuse the existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb);
	}

	/* Switch the skb over to the new head. */
	skb->head = data;
	skb->data = data;
	skb->head_frag = 0;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
#else
	skb->end = skb->head + size;
#endif
	/* skb->len was already reduced above, so skb_headlen() is the new
	 * linear length here.
	 */
	skb_set_tail_pointer(skb, skb_headlen(skb));
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	return 0;
}
5999
6000static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
6001
/* carve out the first eat bytes from skb's frag_list. May recurse into
 * pskb_carve()
 *
 * Works on the frag list hanging off @shinfo; @skb itself is not
 * referenced in the body.
 *
 * Returns 0 on success, -EFAULT if the list holds fewer than @eat bytes,
 * -ENOMEM if cloning or carving a partially eaten element fails.
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
				struct skb_shared_info *shinfo, int eat,
				gfp_t gfp_mask)
{
	struct sk_buff *list = shinfo->frag_list;
	/* Private copy of a shared element that is only partially eaten. */
	struct sk_buff *clone = NULL;
	/* First original list element that is kept. */
	struct sk_buff *insp = NULL;

	do {
		if (!list) {
			pr_err("Not enough bytes to eat. Want %d\n", eat);
			return -EFAULT;
		}
		if (list->len <= eat) {
			/* Eaten as whole. */
			eat -= list->len;
			list = list->next;
			insp = list;
		} else {
			/* Eaten partially. */
			if (skb_shared(list)) {
				/* Carve a clone instead; the shared original
				 * is dropped from the list further down.
				 */
				clone = skb_clone(list, gfp_mask);
				if (!clone)
					return -ENOMEM;
				insp = list->next;
				list = clone;
			} else {
				/* This may be pulled without problems. */
				insp = list;
			}
			if (pskb_carve(list, eat, gfp_mask) < 0) {
				kfree_skb(clone);
				return -ENOMEM;
			}
			break;
		}
	} while (eat);

	/* Free pulled out fragments. */
	while ((list = shinfo->frag_list) != insp) {
		shinfo->frag_list = list->next;
		kfree_skb(list);
	}
	/* And insert new clone at head. */
	if (clone) {
		/* list equals insp once the loop above has finished. */
		clone->next = list;
		shinfo->frag_list = clone;
	}
	return 0;
}
6055
/* carve off the first off bytes from skb. Split line (off) is in the
 * non-linear part of skb
 *
 * @pos is the offset at which the first page frag starts; pskb_carve()
 * passes the linear length (headlen) here.
 *
 * A new, empty head is allocated and the shared info is rebuilt with only
 * the frags (or frag list elements) that lie past @off.
 *
 * Returns 0 on success, -ENOMEM on any failure.
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
				       int pos, gfp_t gfp_mask)
{
	int i, k = 0;
	int size = skb_end_offset(skb);
	u8 *data;
	const int nfrags = skb_shinfo(skb)->nr_frags;
	struct skb_shared_info *shinfo;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	/* Use whatever the allocator really handed out. */
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only the fixed part of the shared info; the frags array is
	 * rebuilt below.
	 */
	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
	if (skb_orphan_frags(skb, gfp_mask)) {
		kfree(data);
		return -ENOMEM;
	}
	shinfo = (struct skb_shared_info *)(data + size);
	/* Keep every frag that ends past off; k counts the kept frags. */
	for (i = 0; i < nfrags; i++) {
		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (pos + fsize > off) {
			shinfo->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < off) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move all the frag to the second
				 *    part, if it is possible. F.e.
				 *    this approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split it accurately. This is what
				 *    we do.
				 *
				 * Only the first kept frag can straddle
				 * off, so k is 0 here and frags[0] is the
				 * frag that was just copied.
				 */
				skb_frag_off_add(&shinfo->frags[0], off - pos);
				skb_frag_size_sub(&shinfo->frags[0], off - pos);
			}
			skb_frag_ref(skb, i);
			k++;
		}
		pos += fsize;
	}
	shinfo->nr_frags = k;
	if (skb_has_frag_list(skb))
		skb_clone_fraglist(skb);

	/* split line is in frag list */
	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
		if (skb_has_frag_list(skb))
			kfree_skb_list(skb_shinfo(skb)->frag_list);
		kfree(data);
		/* Any error from pskb_carve_frag_list() is reported as
		 * -ENOMEM.
		 */
		return -ENOMEM;
	}
	skb_release_data(skb);

	/* Switch the skb over to the new head, which holds no linear data. */
	skb->head = data;
	skb->head_frag = 0;
	skb->data = data;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
#else
	skb->end = skb->head + size;
#endif
	skb_reset_tail_pointer(skb);
	skb_headers_offset_update(skb, 0);
	skb->cloned   = 0;
	skb->hdr_len  = 0;
	skb->nohdr    = 0;
	skb->len -= off;
	/* All remaining bytes are non-linear. */
	skb->data_len = skb->len;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;
}
6142
6143/* remove len bytes from the beginning of the skb */
6144static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
6145{
6146	int headlen = skb_headlen(skb);
6147
6148	if (len < headlen)
6149		return pskb_carve_inside_header(skb, len, headlen, gfp);
6150	else
6151		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
6152}
6153
6154/* Extract to_copy bytes starting at off from skb, and return this in
6155 * a new skb
6156 */
6157struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
6158			     int to_copy, gfp_t gfp)
6159{
6160	struct sk_buff  *clone = skb_clone(skb, gfp);
6161
6162	if (!clone)
6163		return NULL;
6164
6165	if (pskb_carve(clone, off, gfp) < 0 ||
6166	    pskb_trim(clone, to_copy)) {
6167		kfree_skb(clone);
6168		return NULL;
6169	}
6170	return clone;
6171}
6172EXPORT_SYMBOL(pskb_extract);
6173
6174/**
6175 * skb_condense - try to get rid of fragments/frag_list if possible
6176 * @skb: buffer
6177 *
6178 * Can be used to save memory before skb is added to a busy queue.
6179 * If packet has bytes in frags and enough tail room in skb->head,
6180 * pull all of them, so that we can free the frags right now and adjust
6181 * truesize.
6182 * Notes:
6183 *	We do not reallocate skb->head thus can not fail.
6184 *	Caller must re-evaluate skb->truesize if needed.
6185 */
6186void skb_condense(struct sk_buff *skb)
6187{
6188	if (skb->data_len) {
6189		if (skb->data_len > skb->end - skb->tail ||
6190		    skb_cloned(skb))
6191			return;
6192
6193		/* Nice, we can free page frag(s) right now */
6194		__pskb_pull_tail(skb, skb->data_len);
6195	}
6196	/* At this point, skb->truesize might be over estimated,
6197	 * because skb had a fragment, and fragments do not tell
6198	 * their truesize.
6199	 * When we pulled its content into skb->head, fragment
6200	 * was freed, but __pskb_pull_tail() could not possibly
6201	 * adjust skb->truesize, not knowing the frag truesize.
6202	 */
6203	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6204}
6205
6206#ifdef CONFIG_SKB_EXTENSIONS
6207static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
6208{
6209	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
6210}
6211
6212/**
6213 * __skb_ext_alloc - allocate a new skb extensions storage
6214 *
6215 * @flags: See kmalloc().
6216 *
6217 * Returns the newly allocated pointer. The pointer can later attached to a
6218 * skb via __skb_ext_set().
6219 * Note: caller must handle the skb_ext as an opaque data.
6220 */
6221struct skb_ext *__skb_ext_alloc(gfp_t flags)
6222{
6223	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
6224
6225	if (new) {
6226		memset(new->offset, 0, sizeof(new->offset));
6227		refcount_set(&new->refcnt, 1);
6228	}
6229
6230	return new;
6231}
6232
6233static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
6234					 unsigned int old_active)
6235{
6236	struct skb_ext *new;
6237
6238	if (refcount_read(&old->refcnt) == 1)
6239		return old;
6240
6241	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
6242	if (!new)
6243		return NULL;
6244
6245	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
6246	refcount_set(&new->refcnt, 1);
6247
6248#ifdef CONFIG_XFRM
6249	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
6250		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
6251		unsigned int i;
6252
6253		for (i = 0; i < sp->len; i++)
6254			xfrm_state_hold(sp->xvec[i]);
6255	}
6256#endif
6257	__skb_ext_put(old);
6258	return new;
6259}
6260
6261/**
6262 * __skb_ext_set - attach the specified extension storage to this skb
6263 * @skb: buffer
6264 * @id: extension id
6265 * @ext: extension storage previously allocated via __skb_ext_alloc()
6266 *
6267 * Existing extensions, if any, are cleared.
6268 *
6269 * Returns the pointer to the extension.
6270 */
6271void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
6272		    struct skb_ext *ext)
6273{
6274	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
6275
6276	skb_ext_put(skb);
6277	newlen = newoff + skb_ext_type_len[id];
6278	ext->chunks = newlen;
6279	ext->offset[id] = newoff;
6280	skb->extensions = ext;
6281	skb->active_extensions = 1 << id;
6282	return skb_ext_get_ptr(ext, id);
6283}
6284
6285/**
6286 * skb_ext_add - allocate space for given extension, COW if needed
6287 * @skb: buffer
6288 * @id: extension to allocate space for
6289 *
6290 * Allocates enough space for the given extension.
6291 * If the extension is already present, a pointer to that extension
6292 * is returned.
6293 *
6294 * If the skb was cloned, COW applies and the returned memory can be
6295 * modified without changing the extension space of clones buffers.
6296 *
6297 * Returns pointer to the extension or NULL on allocation failure.
6298 */
6299void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
6300{
6301	struct skb_ext *new, *old = NULL;
6302	unsigned int newlen, newoff;
6303
6304	if (skb->active_extensions) {
6305		old = skb->extensions;
6306
6307		new = skb_ext_maybe_cow(old, skb->active_extensions);
6308		if (!new)
6309			return NULL;
6310
6311		if (__skb_ext_exist(new, id))
6312			goto set_active;
6313
6314		newoff = new->chunks;
6315	} else {
6316		newoff = SKB_EXT_CHUNKSIZEOF(*new);
6317
6318		new = __skb_ext_alloc(GFP_ATOMIC);
6319		if (!new)
6320			return NULL;
6321	}
6322
6323	newlen = newoff + skb_ext_type_len[id];
6324	new->chunks = newlen;
6325	new->offset[id] = newoff;
6326set_active:
6327	skb->extensions = new;
6328	skb->active_extensions |= 1 << id;
6329	return skb_ext_get_ptr(new, id);
6330}
6331EXPORT_SYMBOL(skb_ext_add);
6332
6333#ifdef CONFIG_XFRM
6334static void skb_ext_put_sp(struct sec_path *sp)
6335{
6336	unsigned int i;
6337
6338	for (i = 0; i < sp->len; i++)
6339		xfrm_state_put(sp->xvec[i]);
6340}
6341#endif
6342
6343void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
6344{
6345	struct skb_ext *ext = skb->extensions;
6346
6347	skb->active_extensions &= ~(1 << id);
6348	if (skb->active_extensions == 0) {
6349		skb->extensions = NULL;
6350		__skb_ext_put(ext);
6351#ifdef CONFIG_XFRM
6352	} else if (id == SKB_EXT_SEC_PATH &&
6353		   refcount_read(&ext->refcnt) == 1) {
6354		struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
6355
6356		skb_ext_put_sp(sp);
6357		sp->len = 0;
6358#endif
6359	}
6360}
6361EXPORT_SYMBOL(__skb_ext_del);
6362
6363void __skb_ext_put(struct skb_ext *ext)
6364{
6365	/* If this is last clone, nothing can increment
6366	 * it after check passes.  Avoids one atomic op.
6367	 */
6368	if (refcount_read(&ext->refcnt) == 1)
6369		goto free_now;
6370
6371	if (!refcount_dec_and_test(&ext->refcnt))
6372		return;
6373free_now:
6374#ifdef CONFIG_XFRM
6375	if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
6376		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
6377#endif
6378
6379	kmem_cache_free(skbuff_ext_cache, ext);
6380}
6381EXPORT_SYMBOL(__skb_ext_put);
6382#endif /* CONFIG_SKB_EXTENSIONS */