// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>
#include <net/xdp_sock.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
					       GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE				\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM					\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long
 * as the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
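
/* Illustrative example (not part of the upstream file): how a subsystem might
 * publish its own drop reasons through the registration helpers above. The
 * subsystem id SKB_DROP_REASON_SUBSYS_FOO and the reason strings are
 * hypothetical placeholders, kept out of the build on purpose.
 */
#if 0	/* example only */
static const char * const foo_drop_reasons[] = {
	"FOO_NO_ROUTE",
	"FOO_BAD_CSUM",
};

static const struct drop_reason_list foo_drop_reason_list = {
	.reasons = foo_drop_reasons,
	.n_reasons = ARRAY_SIZE(foo_drop_reasons),
};

static int __init foo_init(void)
{
	/* must pass statically allocated memory, see the comment above */
	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
				     &foo_drop_reason_list);
	return 0;
}

static void __exit foo_exit(void)
{
	/* waits for concurrent readers before the list can go away */
	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
}
#endif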

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	128
#define NAPI_SKB_CACHE_BULK	32
#define NAPI_SKB_CACHE_FREE	32

struct napi_alloc_cache {
	local_lock_t bh_lock;
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	data = __page_frag_alloc_align(&nc->page, fragsz,
				       GFP_ATOMIC | __GFP_NOWARN, align_mask);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	return data;

}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		fragsz = SKB_DATA_ALIGN(fragsz);
		data = __page_frag_alloc_align(nc, fragsz,
					       GFP_ATOMIC | __GFP_NOWARN,
					       align_mask);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag_align(fragsz, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
 * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
 */
static u32 skbuff_cache_size __read_mostly;

static struct sk_buff *napi_skb_cache_get(bool alloc)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	if (unlikely(!nc->skb_count)) {
		if (alloc)
			nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
							      GFP_ATOMIC | __GFP_NOWARN,
							      NAPI_SKB_CACHE_BULK,
							      nc->skb_cache);
		if (unlikely(!nc->skb_count)) {
			local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
			return NULL;
		}
	}

	skb = nc->skb_cache[--nc->skb_count];
	if (nc->skb_count)
		prefetch(nc->skb_cache[nc->skb_count - 1]);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	kasan_mempool_unpoison_object(skb, skbuff_cache_size);

	return skb;
}
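
/* Illustrative example (not part of the upstream file): allocating an RX data
 * buffer from the per-CPU frag caches above via the netdev_alloc_frag()
 * wrapper from <linux/skbuff.h>; such a buffer can later be handed to
 * build_skb(). "rx_buf_len" is a hypothetical driver value.
 */
#if 0	/* example only */
static void *example_alloc_rx_buffer(unsigned int rx_buf_len)
{
	unsigned int sz = SKB_DATA_ALIGN(rx_buf_len + NET_SKB_PAD) +
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* hardirq/irqs-disabled callers get the netdev cache, others NAPI */
	return netdev_alloc_frag(sz);
}
#endif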

/**
 * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
 * @skbs: pointer to an at least @n-sized array to fill with skb pointers
 * @n: number of entries to provide
 *
 * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
 * the pointers into the provided array @skbs. If there are fewer entries
 * available, tries to replenish the cache and bulk-allocates the diff from
 * the MM layer if needed.
 * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
 * ready for {,__}build_skb_around() and don't have any data buffers attached.
 * Must be called *only* from the BH context.
 *
 * Return: number of successfully allocated skbs (@n if no actual allocation
 * needed or kmem_cache_alloc_bulk() didn't fail).
 */
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 bulk, total = n;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);

	if (nc->skb_count >= n)
		goto get;

	/* Not enough cached skbs. Try refilling the cache first */
	bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
	nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
					       GFP_ATOMIC | __GFP_NOWARN, bulk,
					       &nc->skb_cache[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* Still not enough. Bulk-allocate the missing part directly, zeroed */
	n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
				   GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
				   n - nc->skb_count, &skbs[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* kmem_cache didn't allocate the number we need, limit the output */
	total -= n - nc->skb_count;
	n = nc->skb_count;

get:
	for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
		skbs[i] = nc->skb_cache[base + i];

		kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
		memset(skbs[i], 0, offsetof(struct sk_buff, tail));
	}

	nc->skb_count -= n;
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	return total;
}
EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(void *data, unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC puts incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);
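
/* Illustrative example (not part of the upstream file): the RX-ring pattern
 * described in the __build_skb() notes above. The driver hands the NIC a
 * page-allocator buffer and, once the frame has landed, wraps it with
 * build_skb(). "rx_buf", "buf_len" and "frame_len" are hypothetical driver
 * state; error handling is reduced to the bare minimum.
 */
#if 0	/* example only */
static struct sk_buff *example_rx_to_skb(void *rx_buf, unsigned int buf_len,
					 unsigned int frame_len)
{
	struct sk_buff *skb;

	/* buf_len must already include NET_SKB_PAD headroom and the
	 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) tailroom.
	 */
	skb = build_skb(rx_buf, buf_len);
	if (unlikely(!skb))
		return NULL;		/* rx_buf is not freed by build_skb() */

	skb_reserve(skb, NET_SKB_PAD);	/* skip the headroom */
	skb_put(skb, frame_len);	/* expose the received bytes */
	return skb;
}
#endif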

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get(true);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
				node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct sk_buff *skb = NULL;
	struct kmem_cache *cache;
	bool pfmemalloc;
	u8 *data;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	if (flags & SKB_ALLOC_FCLONE) {
		cache = net_hotdata.skbuff_fclone_cache;
		goto fallback;
	}
	cache = net_hotdata.skbuff_cache;
	if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
		goto fallback;

	if (flags & SKB_ALLOC_NAPI) {
		skb = napi_skb_cache_get(true);
		if (unlikely(!skb))
			return NULL;
	} else if (!in_hardirq() && !irqs_disabled()) {
		local_bh_disable();
		skb = napi_skb_cache_get(false);
		local_bh_enable();
	}

	if (!skb) {
fallback:
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
		if (unlikely(!skb))
			return NULL;
	}
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
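
/* Illustrative example (not part of the upstream file): the classic
 * alloc_skb() pattern that the allocator above ultimately serves: reserve
 * headroom for yet-to-be-added headers, put the payload, then push each
 * header in front of the data. Sizes and the payload source are placeholders.
 */
#if 0	/* example only */
static struct sk_buff *example_build_tx_packet(const void *payload,
					       unsigned int len,
					       unsigned int hdr_room)
{
	struct sk_buff *skb = alloc_skb(hdr_room + len, GFP_ATOMIC);

	if (!skb)
		return NULL;

	skb_reserve(skb, hdr_room);		/* headroom for headers */
	skb_put_data(skb, payload, len);	/* tail room -> payload */
	/* later: skb_push(skb, ...) once for each protocol header */
	return skb;
}
#endif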

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
	} else {
		local_bh_disable();
		local_lock_nested_bh(&napi_alloc_cache.bh_lock);

		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);

		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
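
/* Illustrative example (not part of the upstream file): a driver using
 * netdev_alloc_skb() for a copy-based RX path. The NET_SKB_PAD headroom is
 * already built in, so only NET_IP_ALIGN is requested on top. "frame" and
 * "frame_len" stand in for hypothetical DMA buffer state.
 */
#if 0	/* example only */
static struct sk_buff *example_rx_copybreak(struct net_device *dev,
					    const void *frame,
					    unsigned int frame_len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, frame_len + NET_IP_ALIGN);

	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, NET_IP_ALIGN);		/* keep the IP header aligned */
	skb_put_data(skb, frame, frame_len);
	skb->protocol = eth_type_trans(skb, dev);
	return skb;
}
#endif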

/**
 * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc = this_cpu_ptr(&napi_alloc_cache);

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
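
/* Illustrative example (not part of the upstream file): a multi-buffer RX
 * path attaching a page fragment with skb_add_rx_frag(), the page-based
 * wrapper around skb_add_rx_frag_netmem() above. The page, offset, length
 * and the half-page truesize are hypothetical descriptor state.
 */
#if 0	/* example only */
static void example_rx_add_fragment(struct sk_buff *skb, struct page *page,
				    unsigned int offset, unsigned int len)
{
	int idx = skb_shinfo(skb)->nr_frags;

	/* truesize accounts for the whole buffer backing the frag, not len */
	skb_add_rx_frag(skb, idx, page, offset, len, PAGE_SIZE / 2);
}
#endif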

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 const struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(netmem_ref netmem)
{
	netmem = netmem_compound_head(netmem);

	if (unlikely(!netmem_is_pp(netmem)))
		return false;

	page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	netmem_ref head_netmem;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
		if (likely(netmem_is_pp(head_netmem)))
			page_pool_ref_netmem(head_netmem);
		else
			page_ref_inc(netmem_to_page(head_netmem));
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!skb_data_unref(skb, shinfo))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
#ifdef CONFIG_INET
		INDIRECT_CALL_4(skb->destructor,
				tcp_wfree, __sock_wfree, sock_wfree,
				xsk_destruct_skb,
				skb);
#else
		INDIRECT_CALL_2(skb->destructor,
				sock_wfree, xsk_destruct_skb,
				skb);

#endif
		skb->destructor = NULL;
		skb->sk = NULL;
	}
	nf_reset_ct(skb);
	skb_ext_reset(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
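
/* Illustrative example (not part of the upstream file): the intended split
 * between consume_skb() for normal completion and kfree_skb_reason() for
 * drops, which feeds the kfree_skb tracepoint handled in
 * sk_skb_reason_drop() below. The "transmitted" flag is a placeholder for
 * driver-specific completion status.
 */
#if 0	/* example only */
static void example_tx_complete(struct sk_buff *skb, bool transmitted)
{
	if (transmitted)
		consume_skb(skb);	/* packet left the box: not a drop */
	else
		kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
}
#endif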

static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
			  enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
				SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
	return true;
}

/**
 * sk_skb_reason_drop - free an sk_buff with special reason
 * @sk: the socket to receive @skb, or NULL if not applicable
 * @skb: buffer to free
 * @reason: reason why this skb is dropped
 *
 * Drop a reference to the buffer and free it if the usage count has hit
 * zero. Meanwhile, pass the receiving socket and drop reason to
 * 'kfree_skb' tracepoint.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (__sk_skb_reason_drop(sk, skb, reason))
		__kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);

#define KFREE_SKB_BULK_SIZE	16

struct skb_free_array {
	unsigned int skb_count;
	void *skb_array[KFREE_SKB_BULK_SIZE];
};

static void kfree_skb_add_bulk(struct sk_buff *skb,
			       struct skb_free_array *sa,
			       enum skb_drop_reason reason)
{
	/* if SKB is a clone, don't handle this case */
	if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb, reason);
	sa->skb_array[sa->skb_count++] = skb;

	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
		kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
				     sa->skb_array);
		sa->skb_count = 0;
	}
}

void __fix_address
kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
	struct skb_free_array sa;

	sa.skb_count = 0;

	while (segs) {
		struct sk_buff *next = segs->next;

		if (__sk_skb_reason_drop(NULL, segs, reason)) {
			skb_poison_list(segs);
			kfree_skb_add_bulk(segs, &sa, reason);
		}

		segs = next;
	}

	if (sa.skb_count)
		kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
	struct skb_shared_info *sh = skb_shinfo(skb);
	struct net_device *dev = skb->dev;
	struct sock *sk = skb->sk;
	struct sk_buff *list_skb;
	bool has_mac, has_trans;
	int headroom, tailroom;
	int i, len, seg_len;

	if (full_pkt)
		len = skb->len;
	else
		len = min_t(int, skb->len, MAX_HEADER + 128);

	headroom = skb_headroom(skb);
	tailroom = skb_tailroom(skb);

	has_mac = skb_mac_header_was_set(skb);
	has_trans = skb_transport_header_was_set(skb);

	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
	       "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
	       "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
	       "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
	       "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
	       level, skb->len, headroom, skb_headlen(skb), tailroom,
	       has_mac ? skb->mac_header : -1,
	       has_mac ? skb_mac_header_len(skb) : -1,
	       skb->mac_len,
	       skb->network_header,
	       has_trans ? skb_network_header_len(skb) : -1,
	       has_trans ? skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
	       skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
	       skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
	       skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
	       skb->inner_network_header, skb->inner_transport_header);

	if (dev)
		printk("%sdev name=%s feat=%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		if (skb_frag_is_net_iov(frag)) {
			printk("%sskb frag %d: not readable\n", level, i);
			len -= skb_frag_size(frag);
			if (!len)
				break;
			continue;
		}

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);
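
/* Illustrative example (not part of the upstream file): the rate-limited way
 * a driver or protocol might call skb_dump() above while chasing a malformed
 * packet, as required by the comment on skb_dump(). The trigger condition is
 * a placeholder.
 */
#if 0	/* example only */
static void example_debug_bad_packet(struct sk_buff *skb)
{
	if (net_ratelimit())
		skb_dump(KERN_WARNING, skb, false);	/* headers only */
}
#endif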

/**
 *	skb_tx_error - report an sk_buff xmit error
 *	@skb: buffer that triggered an error
 *
 *	Report xmit error if a device callback is tracking this skb.
 *	skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb) {
		skb_zcopy_downgrade_managed(skb);
		skb_zcopy_clear(skb, true);
	}
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero.
 *	Functions identically to kfree_skb, but kfree_skb assumes that the
 *	frame is being dropped after a failure and notes that.
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 *	__consume_stateless_skb - free an skbuff, assuming it is stateless
 *	@skb: buffer to free
 *
 *	Like consume_skb(), but this variant assumes that this is the last
 *	skb reference and all the head states have been already dropped.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb, __builtin_return_address(0));
	skb_release_data(skb, SKB_CONSUMED);
	kfree_skbmem(skb);
}

static void napi_skb_cache_put(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	if (!kasan_mempool_poison_object(skb))
		return;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc->skb_cache[nc->skb_count++] = skb;

	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;

		for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_mempool_unpoison_object(nc->skb_cache[i],
						      skbuff_cache_size);

		kmem_cache_free_bulk(net_hotdata.skbuff_cache,
				     NAPI_SKB_CACHE_FREE,
				     nc->skb_cache + remaining);
		nc->skb_count = remaining;
	}
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}

void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_all(skb, reason);
	napi_skb_cache_put(skb);
}

void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	if (unlikely(skb->slow_gro)) {
		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_ext_put(skb);
		skb_orphan(skb);
		skb->slow_gro = 0;
	}
	napi_skb_cache_put(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget || !skb)) {
		dev_consume_skb_any(skb);
		return;
	}

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());

	if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
		skb_release_head_state(skb);
		return skb_attempt_defer_free(skb);
	}

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb, __builtin_return_address(0));

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb, SKB_CONSUMED);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
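
/* Illustrative example (not part of the upstream file): a NAPI poll routine
 * releasing transmitted skbs through napi_consume_skb() so completed heads
 * can land back in the per-CPU cache above. The TX-ring accessor is a
 * hypothetical driver helper.
 */
#if 0	/* example only */
static int example_tx_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = example_tx_ring_next_completed(napi))) {
		/* budget == 0 (netpoll) falls back to dev_consume_skb_any() */
		napi_consume_skb(skb, budget);
		done++;
	}

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}
#endif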

/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) !=		\
		     offsetof(struct sk_buff, headers.field));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in the headers group.
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers, &old->headers, sizeof(new->headers));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
	CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function. Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->peeked = 0;
	C(pfmemalloc);
	C(pp_recycle);
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	refcount_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
	struct sk_buff *n;

	n = alloc_skb(0, GFP_ATOMIC);
	if (!n)
		return NULL;

	n->len = first->len;
	n->data_len = first->len;
	n->truesize = first->truesize;

	skb_shinfo(n)->frag_list = first;

	__copy_skb_header(n, first);
	n->destructor = NULL;

	return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 *	skb_morph - morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst, SKB_CONSUMED);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
	struct user_struct *user;

	if (capable(CAP_IPC_LOCK) || !size)
		return 0;

	rlim = rlimit(RLIMIT_MEMLOCK);
	if (rlim == RLIM_INFINITY)
		return 0;

	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
	max_pg = rlim >> PAGE_SHIFT;
	user = mmp->user ? : current_user();

	old_pg = atomic_long_read(&user->locked_vm);
	do {
		new_pg = old_pg + num_pg;
		if (new_pg > max_pg)
			return -ENOBUFS;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));

	if (!mmp->user) {
		mmp->user = get_uid(user);
		mmp->num_pg = num_pg;
	} else {
		mmp->num_pg += num_pg;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
					    bool devmem)
{
	struct ubuf_info_msgzc *uarg;
	struct sk_buff *skb;

	WARN_ON_ONCE(!in_task());

	skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!skb)
		return NULL;

	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
	uarg = (void *)skb->cb;
	uarg->mmp.user = NULL;

	if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
		kfree_skb(skb);
		return NULL;
	}

	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
	refcount_set(&uarg->ubuf.refcnt, 1);
	sock_hold(sk);

	return &uarg->ubuf;
}

static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
	return container_of((void *)uarg, struct sk_buff, cb);
}

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
				       struct ubuf_info *uarg, bool devmem)
{
	if (uarg) {
		struct ubuf_info_msgzc *uarg_zc;
		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
		u32 bytelen, next;

		/* there might be non MSG_ZEROCOPY users */
		if (uarg->ops != &msg_zerocopy_ubuf_ops)
			return NULL;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
		if (!sock_owned_by_user(sk)) {
			WARN_ON_ONCE(1);
			return NULL;
		}

		uarg_zc = uarg_to_msgzc(uarg);
		bytelen = uarg_zc->bytelen + size;
		if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
			/* TCP can create new skb to attach new uarg */
			if (sk->sk_type == SOCK_STREAM)
				goto new_alloc;
			return NULL;
		}

		next = (u32)atomic_read(&sk->sk_zckey);
		if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
			if (likely(!devmem) &&
			    mm_account_pinned_pages(&uarg_zc->mmp, size))
				return NULL;
			uarg_zc->len++;
			uarg_zc->bytelen = bytelen;
			atomic_set(&sk->sk_zckey, ++next);
			/* no extra ref when appending to datagram (MSG_MORE) */
			if (sk->sk_type == SOCK_STREAM)
				net_zcopy_get(uarg);

			return uarg;
		}
	}

new_alloc:
	return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	u32 old_lo, old_hi;
	u64 sum_len;

	old_lo = serr->ee.ee_info;
	old_hi = serr->ee.ee_data;
	sum_len = old_hi - old_lo + 1ULL + len;

	if (sum_len >= (1ULL << 32))
		return false;

	if (lo != old_hi + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}

static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	struct sk_buff_head *q;
	unsigned long flags;
	bool is_zerocopy;
	u32 lo, hi;
	u16 len;

	mm_unaccount_pinned_pages(&uarg->mmp);

	/* if !len, there was only 1 call, and it was aborted
	 * so do not queue a completion notification
	 */
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
		goto release;

	len = uarg->len;
	lo = uarg->id;
	hi = uarg->id + len - 1;
	is_zerocopy = uarg->zerocopy;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = hi;
	serr->ee.ee_info = lo;
	if (!is_zerocopy)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	tail = skb_peek_tail(q);
	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
		skb = NULL;
	}
	spin_unlock_irqrestore(&q->lock, flags);

	sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}

static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
				  bool success)
{
	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);

	uarg_zc->zerocopy = uarg_zc->zerocopy & success;

	if (refcount_dec_and_test(&uarg->refcnt))
		__msg_zerocopy_callback(uarg_zc);
}

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;

	atomic_dec(&sk->sk_zckey);
	uarg_to_msgzc(uarg)->len--;

	if (have_uref)
		msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
	.complete = msg_zerocopy_complete,
};
EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg,
			     struct net_devmem_dmabuf_binding *binding)
{
	int err, orig_len = skb->len;

	if (uarg->ops->link_skb) {
		err = uarg->ops->link_skb(skb, uarg);
		if (err)
			return err;
	} else {
		struct ubuf_info *orig_uarg = skb_zcopy(skb);

		/* An skb can only point to one uarg. This edge case happens
		 * when TCP appends to an skb, but zerocopy_realloc triggered
		 * a new alloc.
		 */
		if (orig_uarg && uarg != orig_uarg)
			return -EEXIST;
	}

	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
				      binding);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

		/* Streams do not free skb on error. Reset to prev state. */
		iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
		skb->sk = save_sk;
		return err;
	}

	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
	int i;

	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
			if (!gfp_mask) {
				WARN_ON_ONCE(1);
				return -ENOMEM;
			}
			if (skb_uarg(nskb) == skb_uarg(orig))
				return 0;
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}

/**
 *	skb_copy_ubufs - copy userspace skb frags buffers to kernel
 *	@skb: the skb to modify
 *	@gfp_mask: allocation priority
 *
 *	This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 *	It will copy all frags into kernel and drop the reference
 *	to userspace pages.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 *
 *	Returns 0 on success or a negative error code on failure
 *	to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	int i, order, psize, new_frags;
	u32 d_off;

	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
		return -EINVAL;

	if (!skb_frags_readable(skb))
		return -EFAULT;

	if (!num_frags)
		goto release;

	/* We might have to allocate high order pages, so compute what minimum
	 * page order is needed.
1986 */ 1987 order = 0; 1988 while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) 1989 order++; 1990 psize = (PAGE_SIZE << order); 1991 1992 new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); 1993 for (i = 0; i < new_frags; i++) { 1994 page = alloc_pages(gfp_mask | __GFP_COMP, order); 1995 if (!page) { 1996 while (head) { 1997 struct page *next = (struct page *)page_private(head); 1998 put_page(head); 1999 head = next; 2000 } 2001 return -ENOMEM; 2002 } 2003 set_page_private(page, (unsigned long)head); 2004 head = page; 2005 } 2006 2007 page = head; 2008 d_off = 0; 2009 for (i = 0; i < num_frags; i++) { 2010 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2011 u32 p_off, p_len, copied; 2012 struct page *p; 2013 u8 *vaddr; 2014 2015 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), 2016 p, p_off, p_len, copied) { 2017 u32 copy, done = 0; 2018 vaddr = kmap_atomic(p); 2019 2020 while (done < p_len) { 2021 if (d_off == psize) { 2022 d_off = 0; 2023 page = (struct page *)page_private(page); 2024 } 2025 copy = min_t(u32, psize - d_off, p_len - done); 2026 memcpy(page_address(page) + d_off, 2027 vaddr + p_off + done, copy); 2028 done += copy; 2029 d_off += copy; 2030 } 2031 kunmap_atomic(vaddr); 2032 } 2033 } 2034 2035 /* skb frags release userspace buffers */ 2036 for (i = 0; i < num_frags; i++) 2037 skb_frag_unref(skb, i); 2038 2039 /* skb frags point to kernel buffers */ 2040 for (i = 0; i < new_frags - 1; i++) { 2041 __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); 2042 head = (struct page *)page_private(head); 2043 } 2044 __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, 2045 d_off); 2046 skb_shinfo(skb)->nr_frags = new_frags; 2047 2048release: 2049 skb_zcopy_clear(skb, false); 2050 return 0; 2051} 2052EXPORT_SYMBOL_GPL(skb_copy_ubufs); 2053 2054/** 2055 * skb_clone - duplicate an sk_buff 2056 * @skb: buffer to clone 2057 * @gfp_mask: allocation priority 2058 * 2059 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 2060 * copies share the same packet data but not structure. The new 2061 * buffer has a reference count of 1. If the allocation fails the 2062 * function returns %NULL otherwise the new buffer is returned. 2063 * 2064 * If this function is called from an interrupt gfp_mask() must be 2065 * %GFP_ATOMIC. 
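 *
 * Minimal usage sketch (illustrative only; deliver_copy() stands in
 * for whatever consumer takes ownership of the clone and eventually
 * frees it with consume_skb() or kfree_skb()):
 *
 *	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (unlikely(!nskb))
 *		return -ENOMEM;
 *	deliver_copy(nskb);
 *
 * The clone shares the packet data with @skb; use skb_copy() or
 * pskb_copy() when the data itself must be written.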
2066 */ 2067 2068struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 2069{ 2070 struct sk_buff_fclones *fclones = container_of(skb, 2071 struct sk_buff_fclones, 2072 skb1); 2073 struct sk_buff *n; 2074 2075 if (skb_orphan_frags(skb, gfp_mask)) 2076 return NULL; 2077 2078 if (skb->fclone == SKB_FCLONE_ORIG && 2079 refcount_read(&fclones->fclone_ref) == 1) { 2080 n = &fclones->skb2; 2081 refcount_set(&fclones->fclone_ref, 2); 2082 n->fclone = SKB_FCLONE_CLONE; 2083 } else { 2084 if (skb_pfmemalloc(skb)) 2085 gfp_mask |= __GFP_MEMALLOC; 2086 2087 n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); 2088 if (!n) 2089 return NULL; 2090 2091 n->fclone = SKB_FCLONE_UNAVAILABLE; 2092 } 2093 2094 return __skb_clone(n, skb); 2095} 2096EXPORT_SYMBOL(skb_clone); 2097 2098void skb_headers_offset_update(struct sk_buff *skb, int off) 2099{ 2100 /* Only adjust this if it actually is csum_start rather than csum */ 2101 if (skb->ip_summed == CHECKSUM_PARTIAL) 2102 skb->csum_start += off; 2103 /* {transport,network,mac}_header and tail are relative to skb->head */ 2104 skb->transport_header += off; 2105 skb->network_header += off; 2106 if (skb_mac_header_was_set(skb)) 2107 skb->mac_header += off; 2108 skb->inner_transport_header += off; 2109 skb->inner_network_header += off; 2110 skb->inner_mac_header += off; 2111} 2112EXPORT_SYMBOL(skb_headers_offset_update); 2113 2114void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 2115{ 2116 __copy_skb_header(new, old); 2117 2118 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 2119 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 2120 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 2121} 2122EXPORT_SYMBOL(skb_copy_header); 2123 2124static inline int skb_alloc_rx_flag(const struct sk_buff *skb) 2125{ 2126 if (skb_pfmemalloc(skb)) 2127 return SKB_ALLOC_RX; 2128 return 0; 2129} 2130 2131/** 2132 * skb_copy - create private copy of an sk_buff 2133 * @skb: buffer to copy 2134 * @gfp_mask: allocation priority 2135 * 2136 * Make a copy of both an &sk_buff and its data. This is used when the 2137 * caller wishes to modify the data and needs a private copy of the 2138 * data to alter. Returns %NULL on failure or the pointer to the buffer 2139 * on success. The returned buffer has a reference count of 1. 2140 * 2141 * As by-product this function converts non-linear &sk_buff to linear 2142 * one, so that &sk_buff becomes completely private and caller is allowed 2143 * to modify all the data of returned buffer. This means that this 2144 * function is not recommended for use in circumstances when only 2145 * header is going to be modified. Use pskb_copy() instead. 
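 *
 * A sketch of the intended pattern, with rewrite_payload() standing in
 * for a caller-specific modification of the now private, linear data
 * (the copy must eventually be freed with kfree_skb()):
 *
 *	struct sk_buff *w = skb_copy(skb, GFP_ATOMIC);
 *
 *	if (!w)
 *		return -ENOMEM;
 *	rewrite_payload(w->data, w->len);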
2146 */ 2147 2148struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 2149{ 2150 struct sk_buff *n; 2151 unsigned int size; 2152 int headerlen; 2153 2154 if (!skb_frags_readable(skb)) 2155 return NULL; 2156 2157 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2158 return NULL; 2159 2160 headerlen = skb_headroom(skb); 2161 size = skb_end_offset(skb) + skb->data_len; 2162 n = __alloc_skb(size, gfp_mask, 2163 skb_alloc_rx_flag(skb), NUMA_NO_NODE); 2164 if (!n) 2165 return NULL; 2166 2167 /* Set the data pointer */ 2168 skb_reserve(n, headerlen); 2169 /* Set the tail pointer and length */ 2170 skb_put(n, skb->len); 2171 2172 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); 2173 2174 skb_copy_header(n, skb); 2175 return n; 2176} 2177EXPORT_SYMBOL(skb_copy); 2178 2179/** 2180 * __pskb_copy_fclone - create copy of an sk_buff with private head. 2181 * @skb: buffer to copy 2182 * @headroom: headroom of new skb 2183 * @gfp_mask: allocation priority 2184 * @fclone: if true allocate the copy of the skb from the fclone 2185 * cache instead of the head cache; it is recommended to set this 2186 * to true for the cases where the copy will likely be cloned 2187 * 2188 * Make a copy of both an &sk_buff and part of its data, located 2189 * in header. Fragmented data remain shared. This is used when 2190 * the caller wishes to modify only header of &sk_buff and needs 2191 * private copy of the header to alter. Returns %NULL on failure 2192 * or the pointer to the buffer on success. 2193 * The returned buffer has a reference count of 1. 2194 */ 2195 2196struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, 2197 gfp_t gfp_mask, bool fclone) 2198{ 2199 unsigned int size = skb_headlen(skb) + headroom; 2200 int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); 2201 struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); 2202 2203 if (!n) 2204 goto out; 2205 2206 /* Set the data pointer */ 2207 skb_reserve(n, headroom); 2208 /* Set the tail pointer and length */ 2209 skb_put(n, skb_headlen(skb)); 2210 /* Copy the bytes */ 2211 skb_copy_from_linear_data(skb, n->data, n->len); 2212 2213 n->truesize += skb->data_len; 2214 n->data_len = skb->data_len; 2215 n->len = skb->len; 2216 2217 if (skb_shinfo(skb)->nr_frags) { 2218 int i; 2219 2220 if (skb_orphan_frags(skb, gfp_mask) || 2221 skb_zerocopy_clone(n, skb, gfp_mask)) { 2222 kfree_skb(n); 2223 n = NULL; 2224 goto out; 2225 } 2226 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2227 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 2228 skb_frag_ref(skb, i); 2229 } 2230 skb_shinfo(n)->nr_frags = i; 2231 } 2232 2233 if (skb_has_frag_list(skb)) { 2234 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 2235 skb_clone_fraglist(n); 2236 } 2237 2238 skb_copy_header(n, skb); 2239out: 2240 return n; 2241} 2242EXPORT_SYMBOL(__pskb_copy_fclone); 2243 2244/** 2245 * pskb_expand_head - reallocate header of &sk_buff 2246 * @skb: buffer to reallocate 2247 * @nhead: room to add at head 2248 * @ntail: room to add at tail 2249 * @gfp_mask: allocation priority 2250 * 2251 * Expands (or creates identical copy, if @nhead and @ntail are zero) 2252 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have 2253 * reference count of 1. Returns zero in the case of success or error, 2254 * if expansion failed. In the last case, &sk_buff is not changed. 2255 * 2256 * All the pointers pointing into skb header may change and must be 2257 * reloaded after call to this function. 
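 *
 * For example, to make room for a hypothetical encapsulation header of
 * hdr_len bytes (a sketch only; hdr_len and nh are caller-side names,
 * and see the note below about moving metadata before writing to the
 * pushed area):
 *
 *	if (skb_headroom(skb) < hdr_len &&
 *	    pskb_expand_head(skb, SKB_DATA_ALIGN(hdr_len - skb_headroom(skb)),
 *			     0, GFP_ATOMIC))
 *		goto drop;
 *	nh = skb_push(skb, hdr_len);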
2258 * 2259 * Note: If you skb_push() the start of the buffer after reallocating the 2260 * header, call skb_postpush_data_move() first to move the metadata out of 2261 * the way before writing to &sk_buff->data. 2262 */ 2263 2264int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 2265 gfp_t gfp_mask) 2266{ 2267 unsigned int osize = skb_end_offset(skb); 2268 unsigned int size = osize + nhead + ntail; 2269 long off; 2270 u8 *data; 2271 int i; 2272 2273 BUG_ON(nhead < 0); 2274 2275 BUG_ON(skb_shared(skb)); 2276 2277 skb_zcopy_downgrade_managed(skb); 2278 2279 if (skb_pfmemalloc(skb)) 2280 gfp_mask |= __GFP_MEMALLOC; 2281 2282 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 2283 if (!data) 2284 goto nodata; 2285 size = SKB_WITH_OVERHEAD(size); 2286 2287 /* Copy only real data... and, alas, header. This should be 2288 * optimized for the cases when header is void. 2289 */ 2290 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 2291 2292 memcpy((struct skb_shared_info *)(data + size), 2293 skb_shinfo(skb), 2294 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 2295 2296 /* 2297 * if shinfo is shared we must drop the old head gracefully, but if it 2298 * is not we can just drop the old head and let the existing refcount 2299 * be since all we did is relocate the values 2300 */ 2301 if (skb_cloned(skb)) { 2302 if (skb_orphan_frags(skb, gfp_mask)) 2303 goto nofrags; 2304 if (skb_zcopy(skb)) 2305 refcount_inc(&skb_uarg(skb)->refcnt); 2306 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 2307 skb_frag_ref(skb, i); 2308 2309 if (skb_has_frag_list(skb)) 2310 skb_clone_fraglist(skb); 2311 2312 skb_release_data(skb, SKB_CONSUMED); 2313 } else { 2314 skb_free_head(skb); 2315 } 2316 off = (data + nhead) - skb->head; 2317 2318 skb->head = data; 2319 skb->head_frag = 0; 2320 skb->data += off; 2321 2322 skb_set_end_offset(skb, size); 2323#ifdef NET_SKBUFF_DATA_USES_OFFSET 2324 off = nhead; 2325#endif 2326 skb->tail += off; 2327 skb_headers_offset_update(skb, nhead); 2328 skb->cloned = 0; 2329 skb->hdr_len = 0; 2330 skb->nohdr = 0; 2331 atomic_set(&skb_shinfo(skb)->dataref, 1); 2332 2333 /* It is not generally safe to change skb->truesize. 2334 * For the moment, we really care of rx path, or 2335 * when skb is orphaned (not attached to a socket). 
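	 * In those cases (no owning socket, or the sock_edemux destructor)
	 * growing truesize by the amount the head grew keeps the memory
	 * accounting consistent without touching socket buffer charges.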
2336 */ 2337 if (!skb->sk || skb->destructor == sock_edemux) 2338 skb->truesize += size - osize; 2339 2340 return 0; 2341 2342nofrags: 2343 skb_kfree_head(data, size); 2344nodata: 2345 return -ENOMEM; 2346} 2347EXPORT_SYMBOL(pskb_expand_head); 2348 2349/* Make private copy of skb with writable head and some headroom */ 2350 2351struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) 2352{ 2353 struct sk_buff *skb2; 2354 int delta = headroom - skb_headroom(skb); 2355 2356 if (delta <= 0) 2357 skb2 = pskb_copy(skb, GFP_ATOMIC); 2358 else { 2359 skb2 = skb_clone(skb, GFP_ATOMIC); 2360 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, 2361 GFP_ATOMIC)) { 2362 kfree_skb(skb2); 2363 skb2 = NULL; 2364 } 2365 } 2366 return skb2; 2367} 2368EXPORT_SYMBOL(skb_realloc_headroom); 2369 2370/* Note: We plan to rework this in linux-6.4 */ 2371int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) 2372{ 2373 unsigned int saved_end_offset, saved_truesize; 2374 struct skb_shared_info *shinfo; 2375 int res; 2376 2377 saved_end_offset = skb_end_offset(skb); 2378 saved_truesize = skb->truesize; 2379 2380 res = pskb_expand_head(skb, 0, 0, pri); 2381 if (res) 2382 return res; 2383 2384 skb->truesize = saved_truesize; 2385 2386 if (likely(skb_end_offset(skb) == saved_end_offset)) 2387 return 0; 2388 2389 /* We can not change skb->end if the original or new value 2390 * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). 2391 */ 2392 if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || 2393 skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { 2394 /* We think this path should not be taken. 2395 * Add a temporary trace to warn us just in case. 2396 */ 2397 pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", 2398 saved_end_offset, skb_end_offset(skb)); 2399 WARN_ON_ONCE(1); 2400 return 0; 2401 } 2402 2403 shinfo = skb_shinfo(skb); 2404 2405 /* We are about to change back skb->end, 2406 * we need to move skb_shinfo() to its new location. 2407 */ 2408 memmove(skb->head + saved_end_offset, 2409 shinfo, 2410 offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); 2411 2412 skb_set_end_offset(skb, saved_end_offset); 2413 2414 return 0; 2415} 2416 2417/** 2418 * skb_expand_head - reallocate header of &sk_buff 2419 * @skb: buffer to reallocate 2420 * @headroom: needed headroom 2421 * 2422 * Unlike skb_realloc_headroom, this one does not allocate a new skb 2423 * if possible; copies skb->sk to new skb as needed 2424 * and frees original skb in case of failures. 2425 * 2426 * It expect increased headroom and generates warning otherwise. 2427 */ 2428 2429struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) 2430{ 2431 int delta = headroom - skb_headroom(skb); 2432 int osize = skb_end_offset(skb); 2433 struct sock *sk = skb->sk; 2434 2435 if (WARN_ONCE(delta <= 0, 2436 "%s is expecting an increase in the headroom", __func__)) 2437 return skb; 2438 2439 delta = SKB_DATA_ALIGN(delta); 2440 /* pskb_expand_head() might crash, if skb is shared. 
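	 * A private clone is also taken when the skb is not charged to a
	 * socket's write memory; if there is a socket, the clone is then
	 * charged to it via skb_set_owner_w().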
*/ 2441 if (skb_shared(skb) || !is_skb_wmem(skb)) { 2442 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 2443 2444 if (unlikely(!nskb)) 2445 goto fail; 2446 2447 if (sk) 2448 skb_set_owner_w(nskb, sk); 2449 consume_skb(skb); 2450 skb = nskb; 2451 } 2452 if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) 2453 goto fail; 2454 2455 if (sk && is_skb_wmem(skb)) { 2456 delta = skb_end_offset(skb) - osize; 2457 refcount_add(delta, &sk->sk_wmem_alloc); 2458 skb->truesize += delta; 2459 } 2460 return skb; 2461 2462fail: 2463 kfree_skb(skb); 2464 return NULL; 2465} 2466EXPORT_SYMBOL(skb_expand_head); 2467 2468/** 2469 * skb_copy_expand - copy and expand sk_buff 2470 * @skb: buffer to copy 2471 * @newheadroom: new free bytes at head 2472 * @newtailroom: new free bytes at tail 2473 * @gfp_mask: allocation priority 2474 * 2475 * Make a copy of both an &sk_buff and its data and while doing so 2476 * allocate additional space. 2477 * 2478 * This is used when the caller wishes to modify the data and needs a 2479 * private copy of the data to alter as well as more space for new fields. 2480 * Returns %NULL on failure or the pointer to the buffer 2481 * on success. The returned buffer has a reference count of 1. 2482 * 2483 * You must pass %GFP_ATOMIC as the allocation priority if this function 2484 * is called from an interrupt. 2485 */ 2486struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 2487 int newheadroom, int newtailroom, 2488 gfp_t gfp_mask) 2489{ 2490 /* 2491 * Allocate the copy buffer 2492 */ 2493 int head_copy_len, head_copy_off; 2494 struct sk_buff *n; 2495 int oldheadroom; 2496 2497 if (!skb_frags_readable(skb)) 2498 return NULL; 2499 2500 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2501 return NULL; 2502 2503 oldheadroom = skb_headroom(skb); 2504 n = __alloc_skb(newheadroom + skb->len + newtailroom, 2505 gfp_mask, skb_alloc_rx_flag(skb), 2506 NUMA_NO_NODE); 2507 if (!n) 2508 return NULL; 2509 2510 skb_reserve(n, newheadroom); 2511 2512 /* Set the tail pointer and length */ 2513 skb_put(n, skb->len); 2514 2515 head_copy_len = oldheadroom; 2516 head_copy_off = 0; 2517 if (newheadroom <= head_copy_len) 2518 head_copy_len = newheadroom; 2519 else 2520 head_copy_off = newheadroom - head_copy_len; 2521 2522 /* Copy the linear header and data. */ 2523 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 2524 skb->len + head_copy_len)); 2525 2526 skb_copy_header(n, skb); 2527 2528 skb_headers_offset_update(n, newheadroom - oldheadroom); 2529 2530 return n; 2531} 2532EXPORT_SYMBOL(skb_copy_expand); 2533 2534/** 2535 * __skb_pad - zero pad the tail of an skb 2536 * @skb: buffer to pad 2537 * @pad: space to pad 2538 * @free_on_error: free buffer on error 2539 * 2540 * Ensure that a buffer is followed by a padding area that is zero 2541 * filled. Used by network drivers which may DMA or transfer data 2542 * beyond the buffer end onto the wire. 2543 * 2544 * May return error in out of memory cases. The skb is freed on error 2545 * if @free_on_error is true. 2546 */ 2547 2548int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) 2549{ 2550 int err; 2551 int ntail; 2552 2553 /* If the skbuff is non linear tailroom is always zero.. 
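	 * so when the head is not cloned and already has @pad bytes of
	 * tailroom, the padding can simply be zeroed in place below.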
*/ 2554 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 2555 memset(skb->data+skb->len, 0, pad); 2556 return 0; 2557 } 2558 2559 ntail = skb->data_len + pad - (skb->end - skb->tail); 2560 if (likely(skb_cloned(skb) || ntail > 0)) { 2561 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 2562 if (unlikely(err)) 2563 goto free_skb; 2564 } 2565 2566 /* FIXME: The use of this function with non-linear skb's really needs 2567 * to be audited. 2568 */ 2569 err = skb_linearize(skb); 2570 if (unlikely(err)) 2571 goto free_skb; 2572 2573 memset(skb->data + skb->len, 0, pad); 2574 return 0; 2575 2576free_skb: 2577 if (free_on_error) 2578 kfree_skb(skb); 2579 return err; 2580} 2581EXPORT_SYMBOL(__skb_pad); 2582 2583/** 2584 * pskb_put - add data to the tail of a potentially fragmented buffer 2585 * @skb: start of the buffer to use 2586 * @tail: tail fragment of the buffer to use 2587 * @len: amount of data to add 2588 * 2589 * This function extends the used data area of the potentially 2590 * fragmented buffer. @tail must be the last fragment of @skb -- or 2591 * @skb itself. If this would exceed the total buffer size the kernel 2592 * will panic. A pointer to the first byte of the extra data is 2593 * returned. 2594 */ 2595 2596void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) 2597{ 2598 if (tail != skb) { 2599 skb->data_len += len; 2600 skb->len += len; 2601 } 2602 return skb_put(tail, len); 2603} 2604EXPORT_SYMBOL_GPL(pskb_put); 2605 2606/** 2607 * skb_put - add data to a buffer 2608 * @skb: buffer to use 2609 * @len: amount of data to add 2610 * 2611 * This function extends the used data area of the buffer. If this would 2612 * exceed the total buffer size the kernel will panic. A pointer to the 2613 * first byte of the extra data is returned. 2614 */ 2615void *skb_put(struct sk_buff *skb, unsigned int len) 2616{ 2617 void *tmp = skb_tail_pointer(skb); 2618 SKB_LINEAR_ASSERT(skb); 2619 skb->tail += len; 2620 skb->len += len; 2621 if (unlikely(skb->tail > skb->end)) 2622 skb_over_panic(skb, len, __builtin_return_address(0)); 2623 return tmp; 2624} 2625EXPORT_SYMBOL(skb_put); 2626 2627/** 2628 * skb_push - add data to the start of a buffer 2629 * @skb: buffer to use 2630 * @len: amount of data to add 2631 * 2632 * This function extends the used data area of the buffer at the buffer 2633 * start. If this would exceed the total buffer headroom the kernel will 2634 * panic. A pointer to the first byte of the extra data is returned. 2635 */ 2636void *skb_push(struct sk_buff *skb, unsigned int len) 2637{ 2638 skb->data -= len; 2639 skb->len += len; 2640 if (unlikely(skb->data < skb->head)) 2641 skb_under_panic(skb, len, __builtin_return_address(0)); 2642 return skb->data; 2643} 2644EXPORT_SYMBOL(skb_push); 2645 2646/** 2647 * skb_pull - remove data from the start of a buffer 2648 * @skb: buffer to use 2649 * @len: amount of data to remove 2650 * 2651 * This function removes data from the start of a buffer, returning 2652 * the memory to the headroom. A pointer to the next data in the buffer 2653 * is returned. Once the data has been pulled future pushes will overwrite 2654 * the old data. 2655 */ 2656void *skb_pull(struct sk_buff *skb, unsigned int len) 2657{ 2658 return skb_pull_inline(skb, len); 2659} 2660EXPORT_SYMBOL(skb_pull); 2661 2662/** 2663 * skb_pull_data - remove data from the start of a buffer returning its 2664 * original position. 
2665 * @skb: buffer to use 2666 * @len: amount of data to remove 2667 * 2668 * This function removes data from the start of a buffer, returning 2669 * the memory to the headroom. A pointer to the original data in the buffer 2670 * is returned after checking if there is enough data to pull. Once the 2671 * data has been pulled future pushes will overwrite the old data. 2672 */ 2673void *skb_pull_data(struct sk_buff *skb, size_t len) 2674{ 2675 void *data = skb->data; 2676 2677 if (skb->len < len) 2678 return NULL; 2679 2680 skb_pull(skb, len); 2681 2682 return data; 2683} 2684EXPORT_SYMBOL(skb_pull_data); 2685 2686/** 2687 * skb_trim - remove end from a buffer 2688 * @skb: buffer to alter 2689 * @len: new length 2690 * 2691 * Cut the length of a buffer down by removing data from the tail. If 2692 * the buffer is already under the length specified it is not modified. 2693 * The skb must be linear. 2694 */ 2695void skb_trim(struct sk_buff *skb, unsigned int len) 2696{ 2697 if (skb->len > len) 2698 __skb_trim(skb, len); 2699} 2700EXPORT_SYMBOL(skb_trim); 2701 2702/* Trims skb to length len. It can change skb pointers. 2703 */ 2704 2705int ___pskb_trim(struct sk_buff *skb, unsigned int len) 2706{ 2707 struct sk_buff **fragp; 2708 struct sk_buff *frag; 2709 int offset = skb_headlen(skb); 2710 int nfrags = skb_shinfo(skb)->nr_frags; 2711 int i; 2712 int err; 2713 2714 if (skb_cloned(skb) && 2715 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 2716 return err; 2717 2718 i = 0; 2719 if (offset >= len) 2720 goto drop_pages; 2721 2722 for (; i < nfrags; i++) { 2723 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2724 2725 if (end < len) { 2726 offset = end; 2727 continue; 2728 } 2729 2730 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 2731 2732drop_pages: 2733 skb_shinfo(skb)->nr_frags = i; 2734 2735 for (; i < nfrags; i++) 2736 skb_frag_unref(skb, i); 2737 2738 if (skb_has_frag_list(skb)) 2739 skb_drop_fraglist(skb); 2740 goto done; 2741 } 2742 2743 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 2744 fragp = &frag->next) { 2745 int end = offset + frag->len; 2746 2747 if (skb_shared(frag)) { 2748 struct sk_buff *nfrag; 2749 2750 nfrag = skb_clone(frag, GFP_ATOMIC); 2751 if (unlikely(!nfrag)) 2752 return -ENOMEM; 2753 2754 nfrag->next = frag->next; 2755 consume_skb(frag); 2756 frag = nfrag; 2757 *fragp = frag; 2758 } 2759 2760 if (end < len) { 2761 offset = end; 2762 continue; 2763 } 2764 2765 if (end > len && 2766 unlikely((err = pskb_trim(frag, len - offset)))) 2767 return err; 2768 2769 if (frag->next) 2770 skb_drop_list(&frag->next); 2771 break; 2772 } 2773 2774done: 2775 if (len > skb_headlen(skb)) { 2776 skb->data_len -= skb->len - len; 2777 skb->len = len; 2778 } else { 2779 skb->len = len; 2780 skb->data_len = 0; 2781 skb_set_tail_pointer(skb, len); 2782 } 2783 2784 if (!skb->sk || skb->destructor == sock_edemux) 2785 skb_condense(skb); 2786 return 0; 2787} 2788EXPORT_SYMBOL(___pskb_trim); 2789 2790/* Note : use pskb_trim_rcsum() instead of calling this directly 2791 */ 2792int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2793{ 2794 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2795 int delta = skb->len - len; 2796 2797 skb->csum = csum_block_sub(skb->csum, 2798 skb_checksum(skb, len, delta, 0), 2799 len); 2800 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 2801 int hdlen = (len > skb_headlen(skb)) ? 
skb_headlen(skb) : len; 2802 int offset = skb_checksum_start_offset(skb) + skb->csum_offset; 2803 2804 if (offset + sizeof(__sum16) > hdlen) 2805 return -EINVAL; 2806 } 2807 return __pskb_trim(skb, len); 2808} 2809EXPORT_SYMBOL(pskb_trim_rcsum_slow); 2810 2811/** 2812 * __pskb_pull_tail - advance tail of skb header 2813 * @skb: buffer to reallocate 2814 * @delta: number of bytes to advance tail 2815 * 2816 * The function makes a sense only on a fragmented &sk_buff, 2817 * it expands header moving its tail forward and copying necessary 2818 * data from fragmented part. 2819 * 2820 * &sk_buff MUST have reference count of 1. 2821 * 2822 * Returns %NULL (and &sk_buff does not change) if pull failed 2823 * or value of new tail of skb in the case of success. 2824 * 2825 * All the pointers pointing into skb header may change and must be 2826 * reloaded after call to this function. 2827 */ 2828 2829/* Moves tail of skb head forward, copying data from fragmented part, 2830 * when it is necessary. 2831 * 1. It may fail due to malloc failure. 2832 * 2. It may change skb pointers. 2833 * 2834 * It is pretty complicated. Luckily, it is called only in exceptional cases. 2835 */ 2836void *__pskb_pull_tail(struct sk_buff *skb, int delta) 2837{ 2838 /* If skb has not enough free space at tail, get new one 2839 * plus 128 bytes for future expansions. If we have enough 2840 * room at tail, reallocate without expansion only if skb is cloned. 2841 */ 2842 int i, k, eat = (skb->tail + delta) - skb->end; 2843 2844 if (!skb_frags_readable(skb)) 2845 return NULL; 2846 2847 if (eat > 0 || skb_cloned(skb)) { 2848 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2849 GFP_ATOMIC)) 2850 return NULL; 2851 } 2852 2853 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2854 skb_tail_pointer(skb), delta)); 2855 2856 /* Optimization: no fragments, no reasons to preestimate 2857 * size of pulled pages. Superb. 2858 */ 2859 if (!skb_has_frag_list(skb)) 2860 goto pull_pages; 2861 2862 /* Estimate size of pulled pages. */ 2863 eat = delta; 2864 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2865 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2866 2867 if (size >= eat) 2868 goto pull_pages; 2869 eat -= size; 2870 } 2871 2872 /* If we need update frag list, we are in troubles. 2873 * Certainly, it is possible to add an offset to skb data, 2874 * but taking into account that pulling is expected to 2875 * be very rare operation, it is worth to fight against 2876 * further bloating skb head and crucify ourselves here instead. 2877 * Pure masohism, indeed. 8)8) 2878 */ 2879 if (eat) { 2880 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2881 struct sk_buff *clone = NULL; 2882 struct sk_buff *insp = NULL; 2883 2884 do { 2885 if (list->len <= eat) { 2886 /* Eaten as whole. */ 2887 eat -= list->len; 2888 list = list->next; 2889 insp = list; 2890 } else { 2891 /* Eaten partially. */ 2892 if (skb_is_gso(skb) && !list->head_frag && 2893 skb_headlen(list)) 2894 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2895 2896 if (skb_shared(list)) { 2897 /* Sucks! We need to fork list. :-( */ 2898 clone = skb_clone(list, GFP_ATOMIC); 2899 if (!clone) 2900 return NULL; 2901 insp = list->next; 2902 list = clone; 2903 } else { 2904 /* This may be pulled without 2905 * problems. */ 2906 insp = list; 2907 } 2908 if (!pskb_pull(list, eat)) { 2909 kfree_skb(clone); 2910 return NULL; 2911 } 2912 break; 2913 } 2914 } while (eat); 2915 2916 /* Free pulled out fragments. 
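		 * Everything up to @insp is no longer needed on the frag
		 * list: it has either been consumed entirely or superseded
		 * by @clone, so unlink and release those skbs now.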
*/ 2917 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2918 skb_shinfo(skb)->frag_list = list->next; 2919 consume_skb(list); 2920 } 2921 /* And insert new clone at head. */ 2922 if (clone) { 2923 clone->next = list; 2924 skb_shinfo(skb)->frag_list = clone; 2925 } 2926 } 2927 /* Success! Now we may commit changes to skb data. */ 2928 2929pull_pages: 2930 eat = delta; 2931 k = 0; 2932 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2933 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2934 2935 if (size <= eat) { 2936 skb_frag_unref(skb, i); 2937 eat -= size; 2938 } else { 2939 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2940 2941 *frag = skb_shinfo(skb)->frags[i]; 2942 if (eat) { 2943 skb_frag_off_add(frag, eat); 2944 skb_frag_size_sub(frag, eat); 2945 if (!i) 2946 goto end; 2947 eat = 0; 2948 } 2949 k++; 2950 } 2951 } 2952 skb_shinfo(skb)->nr_frags = k; 2953 2954end: 2955 skb->tail += delta; 2956 skb->data_len -= delta; 2957 2958 if (!skb->data_len) 2959 skb_zcopy_clear(skb, false); 2960 2961 return skb_tail_pointer(skb); 2962} 2963EXPORT_SYMBOL(__pskb_pull_tail); 2964 2965/** 2966 * skb_copy_bits - copy bits from skb to kernel buffer 2967 * @skb: source skb 2968 * @offset: offset in source 2969 * @to: destination buffer 2970 * @len: number of bytes to copy 2971 * 2972 * Copy the specified number of bytes from the source skb to the 2973 * destination buffer. 2974 * 2975 * CAUTION ! : 2976 * If its prototype is ever changed, 2977 * check arch/{*}/net/{*}.S files, 2978 * since it is called from BPF assembly code. 2979 */ 2980int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2981{ 2982 int start = skb_headlen(skb); 2983 struct sk_buff *frag_iter; 2984 int i, copy; 2985 2986 if (offset > (int)skb->len - len) 2987 goto fault; 2988 2989 /* Copy header. */ 2990 if ((copy = start - offset) > 0) { 2991 if (copy > len) 2992 copy = len; 2993 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2994 if ((len -= copy) == 0) 2995 return 0; 2996 offset += copy; 2997 to += copy; 2998 } 2999 3000 if (!skb_frags_readable(skb)) 3001 goto fault; 3002 3003 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3004 int end; 3005 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 3006 3007 WARN_ON(start > offset + len); 3008 3009 end = start + skb_frag_size(f); 3010 if ((copy = end - offset) > 0) { 3011 u32 p_off, p_len, copied; 3012 struct page *p; 3013 u8 *vaddr; 3014 3015 if (copy > len) 3016 copy = len; 3017 3018 skb_frag_foreach_page(f, 3019 skb_frag_off(f) + offset - start, 3020 copy, p, p_off, p_len, copied) { 3021 vaddr = kmap_atomic(p); 3022 memcpy(to + copied, vaddr + p_off, p_len); 3023 kunmap_atomic(vaddr); 3024 } 3025 3026 if ((len -= copy) == 0) 3027 return 0; 3028 offset += copy; 3029 to += copy; 3030 } 3031 start = end; 3032 } 3033 3034 skb_walk_frags(skb, frag_iter) { 3035 int end; 3036 3037 WARN_ON(start > offset + len); 3038 3039 end = start + frag_iter->len; 3040 if ((copy = end - offset) > 0) { 3041 if (copy > len) 3042 copy = len; 3043 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 3044 goto fault; 3045 if ((len -= copy) == 0) 3046 return 0; 3047 offset += copy; 3048 to += copy; 3049 } 3050 start = end; 3051 } 3052 3053 if (!len) 3054 return 0; 3055 3056fault: 3057 return -EFAULT; 3058} 3059EXPORT_SYMBOL(skb_copy_bits); 3060 3061/* 3062 * Callback from splice_to_pipe(), if we need to release some pages 3063 * at the end of the spd in case we error'ed out in filling the pipe. 
3064 */ 3065static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 3066{ 3067 put_page(spd->pages[i]); 3068} 3069 3070static struct page *linear_to_page(struct page *page, unsigned int *len, 3071 unsigned int *offset, 3072 struct sock *sk) 3073{ 3074 struct page_frag *pfrag = sk_page_frag(sk); 3075 3076 if (!sk_page_frag_refill(sk, pfrag)) 3077 return NULL; 3078 3079 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 3080 3081 memcpy(page_address(pfrag->page) + pfrag->offset, 3082 page_address(page) + *offset, *len); 3083 *offset = pfrag->offset; 3084 pfrag->offset += *len; 3085 3086 return pfrag->page; 3087} 3088 3089static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 3090 struct page *page, 3091 unsigned int offset) 3092{ 3093 return spd->nr_pages && 3094 spd->pages[spd->nr_pages - 1] == page && 3095 (spd->partial[spd->nr_pages - 1].offset + 3096 spd->partial[spd->nr_pages - 1].len == offset); 3097} 3098 3099/* 3100 * Fill page/offset/length into spd, if it can hold more pages. 3101 */ 3102static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, 3103 unsigned int *len, unsigned int offset, bool linear, 3104 struct sock *sk) 3105{ 3106 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 3107 return true; 3108 3109 if (linear) { 3110 page = linear_to_page(page, len, &offset, sk); 3111 if (!page) 3112 return true; 3113 } 3114 if (spd_can_coalesce(spd, page, offset)) { 3115 spd->partial[spd->nr_pages - 1].len += *len; 3116 return false; 3117 } 3118 get_page(page); 3119 spd->pages[spd->nr_pages] = page; 3120 spd->partial[spd->nr_pages].len = *len; 3121 spd->partial[spd->nr_pages].offset = offset; 3122 spd->nr_pages++; 3123 3124 return false; 3125} 3126 3127static bool __splice_segment(struct page *page, unsigned int poff, 3128 unsigned int plen, unsigned int *off, 3129 unsigned int *len, 3130 struct splice_pipe_desc *spd, bool linear, 3131 struct sock *sk) 3132{ 3133 if (!*len) 3134 return true; 3135 3136 /* skip this segment if already processed */ 3137 if (*off >= plen) { 3138 *off -= plen; 3139 return false; 3140 } 3141 3142 /* ignore any bits we already processed */ 3143 poff += *off; 3144 plen -= *off; 3145 *off = 0; 3146 3147 do { 3148 unsigned int flen = min(*len, plen); 3149 3150 if (spd_fill_page(spd, page, &flen, poff, linear, sk)) 3151 return true; 3152 poff += flen; 3153 plen -= flen; 3154 *len -= flen; 3155 if (!*len) 3156 return true; 3157 } while (plen); 3158 3159 return false; 3160} 3161 3162/* 3163 * Map linear and fragment data from the skb to spd. It reports true if the 3164 * pipe is full or if we already spliced the requested length. 3165 */ 3166static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 3167 unsigned int *offset, unsigned int *len, 3168 struct splice_pipe_desc *spd, struct sock *sk) 3169{ 3170 struct sk_buff *iter; 3171 int seg; 3172 3173 /* map the linear part : 3174 * If skb->head_frag is set, this 'linear' part is backed by a 3175 * fragment, and if the head is not shared with any clones then 3176 * we can avoid a copy since we own the head portion of this page. 
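	 * Otherwise (the head is cloned or not backed by a page fragment)
	 * it is treated as 'linear' and linear_to_page() copies the data
	 * into the socket's page frag before it is handed to the pipe.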
3177 */ 3178 if (__splice_segment(virt_to_page(skb->data), 3179 (unsigned long) skb->data & (PAGE_SIZE - 1), 3180 skb_headlen(skb), 3181 offset, len, spd, 3182 skb_head_is_locked(skb), 3183 sk)) 3184 return true; 3185 3186 /* 3187 * then map the fragments 3188 */ 3189 if (!skb_frags_readable(skb)) 3190 return false; 3191 3192 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 3193 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 3194 3195 if (WARN_ON_ONCE(!skb_frag_page(f))) 3196 return false; 3197 3198 if (__splice_segment(skb_frag_page(f), 3199 skb_frag_off(f), skb_frag_size(f), 3200 offset, len, spd, false, sk)) 3201 return true; 3202 } 3203 3204 skb_walk_frags(skb, iter) { 3205 if (*offset >= iter->len) { 3206 *offset -= iter->len; 3207 continue; 3208 } 3209 /* __skb_splice_bits() only fails if the output has no room 3210 * left, so no point in going over the frag_list for the error 3211 * case. 3212 */ 3213 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 3214 return true; 3215 } 3216 3217 return false; 3218} 3219 3220/* 3221 * Map data from the skb to a pipe. Should handle both the linear part, 3222 * the fragments, and the frag list. 3223 */ 3224int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 3225 struct pipe_inode_info *pipe, unsigned int tlen, 3226 unsigned int flags) 3227{ 3228 struct partial_page partial[MAX_SKB_FRAGS]; 3229 struct page *pages[MAX_SKB_FRAGS]; 3230 struct splice_pipe_desc spd = { 3231 .pages = pages, 3232 .partial = partial, 3233 .nr_pages_max = MAX_SKB_FRAGS, 3234 .ops = &nosteal_pipe_buf_ops, 3235 .spd_release = sock_spd_release, 3236 }; 3237 int ret = 0; 3238 3239 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 3240 3241 if (spd.nr_pages) 3242 ret = splice_to_pipe(pipe, &spd); 3243 3244 return ret; 3245} 3246EXPORT_SYMBOL_GPL(skb_splice_bits); 3247 3248static int sendmsg_locked(struct sock *sk, struct msghdr *msg) 3249{ 3250 struct socket *sock = sk->sk_socket; 3251 size_t size = msg_data_left(msg); 3252 3253 if (!sock) 3254 return -EINVAL; 3255 3256 if (!sock->ops->sendmsg_locked) 3257 return sock_no_sendmsg_locked(sk, msg, size); 3258 3259 return sock->ops->sendmsg_locked(sk, msg, size); 3260} 3261 3262static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) 3263{ 3264 struct socket *sock = sk->sk_socket; 3265 3266 if (!sock) 3267 return -EINVAL; 3268 return sock_sendmsg(sock, msg); 3269} 3270 3271typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); 3272static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, 3273 int len, sendmsg_func sendmsg, int flags) 3274{ 3275 int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; 3276 unsigned int orig_len = len; 3277 struct sk_buff *head = skb; 3278 unsigned short fragidx; 3279 int slen, ret; 3280 3281do_frag_list: 3282 3283 /* Deal with head data */ 3284 while (offset < skb_headlen(skb) && len) { 3285 struct kvec kv; 3286 struct msghdr msg; 3287 3288 slen = min_t(int, len, skb_headlen(skb) - offset); 3289 kv.iov_base = skb->data + offset; 3290 kv.iov_len = slen; 3291 memset(&msg, 0, sizeof(msg)); 3292 msg.msg_flags = MSG_DONTWAIT | flags; 3293 if (slen < len) 3294 msg.msg_flags |= more_hint; 3295 3296 iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); 3297 ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, 3298 sendmsg_unlocked, sk, &msg); 3299 if (ret <= 0) 3300 goto error; 3301 3302 offset += ret; 3303 len -= ret; 3304 } 3305 3306 /* All the data was skb head? 
*/ 3307 if (!len) 3308 goto out; 3309 3310 /* Make offset relative to start of frags */ 3311 offset -= skb_headlen(skb); 3312 3313 /* Find where we are in frag list */ 3314 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 3315 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 3316 3317 if (offset < skb_frag_size(frag)) 3318 break; 3319 3320 offset -= skb_frag_size(frag); 3321 } 3322 3323 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 3324 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 3325 3326 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 3327 3328 while (slen) { 3329 struct bio_vec bvec; 3330 struct msghdr msg = { 3331 .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | 3332 flags, 3333 }; 3334 3335 if (slen < len) 3336 msg.msg_flags |= more_hint; 3337 bvec_set_page(&bvec, skb_frag_page(frag), slen, 3338 skb_frag_off(frag) + offset); 3339 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, 3340 slen); 3341 3342 ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, 3343 sendmsg_unlocked, sk, &msg); 3344 if (ret <= 0) 3345 goto error; 3346 3347 len -= ret; 3348 offset += ret; 3349 slen -= ret; 3350 } 3351 3352 offset = 0; 3353 } 3354 3355 if (len) { 3356 /* Process any frag lists */ 3357 3358 if (skb == head) { 3359 if (skb_has_frag_list(skb)) { 3360 skb = skb_shinfo(skb)->frag_list; 3361 goto do_frag_list; 3362 } 3363 } else if (skb->next) { 3364 skb = skb->next; 3365 goto do_frag_list; 3366 } 3367 } 3368 3369out: 3370 return orig_len - len; 3371 3372error: 3373 return orig_len == len ? ret : orig_len - len; 3374} 3375 3376/* Send skb data on a socket. Socket must be locked. */ 3377int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 3378 int len) 3379{ 3380 return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); 3381} 3382EXPORT_SYMBOL_GPL(skb_send_sock_locked); 3383 3384int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, 3385 int offset, int len, int flags) 3386{ 3387 return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); 3388} 3389EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); 3390 3391/* Send skb data on a socket. Socket must be unlocked. */ 3392int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) 3393{ 3394 return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); 3395} 3396 3397/** 3398 * skb_store_bits - store bits from kernel buffer to skb 3399 * @skb: destination buffer 3400 * @offset: offset in destination 3401 * @from: source buffer 3402 * @len: number of bytes to copy 3403 * 3404 * Copy the specified number of bytes from the source buffer to the 3405 * destination skb. This function handles all the messy bits of 3406 * traversing fragment lists and such. 
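 *
 * For instance, patching a two byte field at offset off of a possibly
 * non-linear skb could look like this (a sketch; new_id and off are
 * caller supplied):
 *
 *	__be16 val = htons(new_id);
 *
 *	if (skb_store_bits(skb, off, &val, sizeof(val)))
 *		return -EFAULT;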
3407 */ 3408 3409int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 3410{ 3411 int start = skb_headlen(skb); 3412 struct sk_buff *frag_iter; 3413 int i, copy; 3414 3415 if (offset > (int)skb->len - len) 3416 goto fault; 3417 3418 if ((copy = start - offset) > 0) { 3419 if (copy > len) 3420 copy = len; 3421 skb_copy_to_linear_data_offset(skb, offset, from, copy); 3422 if ((len -= copy) == 0) 3423 return 0; 3424 offset += copy; 3425 from += copy; 3426 } 3427 3428 if (!skb_frags_readable(skb)) 3429 goto fault; 3430 3431 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3432 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3433 int end; 3434 3435 WARN_ON(start > offset + len); 3436 3437 end = start + skb_frag_size(frag); 3438 if ((copy = end - offset) > 0) { 3439 u32 p_off, p_len, copied; 3440 struct page *p; 3441 u8 *vaddr; 3442 3443 if (copy > len) 3444 copy = len; 3445 3446 skb_frag_foreach_page(frag, 3447 skb_frag_off(frag) + offset - start, 3448 copy, p, p_off, p_len, copied) { 3449 vaddr = kmap_atomic(p); 3450 memcpy(vaddr + p_off, from + copied, p_len); 3451 kunmap_atomic(vaddr); 3452 } 3453 3454 if ((len -= copy) == 0) 3455 return 0; 3456 offset += copy; 3457 from += copy; 3458 } 3459 start = end; 3460 } 3461 3462 skb_walk_frags(skb, frag_iter) { 3463 int end; 3464 3465 WARN_ON(start > offset + len); 3466 3467 end = start + frag_iter->len; 3468 if ((copy = end - offset) > 0) { 3469 if (copy > len) 3470 copy = len; 3471 if (skb_store_bits(frag_iter, offset - start, 3472 from, copy)) 3473 goto fault; 3474 if ((len -= copy) == 0) 3475 return 0; 3476 offset += copy; 3477 from += copy; 3478 } 3479 start = end; 3480 } 3481 if (!len) 3482 return 0; 3483 3484fault: 3485 return -EFAULT; 3486} 3487EXPORT_SYMBOL(skb_store_bits); 3488 3489/* Checksum skb data. */ 3490__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) 3491{ 3492 int start = skb_headlen(skb); 3493 int i, copy = start - offset; 3494 struct sk_buff *frag_iter; 3495 int pos = 0; 3496 3497 /* Checksum header. 
*/ 3498 if (copy > 0) { 3499 if (copy > len) 3500 copy = len; 3501 csum = csum_partial(skb->data + offset, copy, csum); 3502 if ((len -= copy) == 0) 3503 return csum; 3504 offset += copy; 3505 pos = copy; 3506 } 3507 3508 if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3509 return 0; 3510 3511 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3512 int end; 3513 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3514 3515 WARN_ON(start > offset + len); 3516 3517 end = start + skb_frag_size(frag); 3518 if ((copy = end - offset) > 0) { 3519 u32 p_off, p_len, copied; 3520 struct page *p; 3521 __wsum csum2; 3522 u8 *vaddr; 3523 3524 if (copy > len) 3525 copy = len; 3526 3527 skb_frag_foreach_page(frag, 3528 skb_frag_off(frag) + offset - start, 3529 copy, p, p_off, p_len, copied) { 3530 vaddr = kmap_atomic(p); 3531 csum2 = csum_partial(vaddr + p_off, p_len, 0); 3532 kunmap_atomic(vaddr); 3533 csum = csum_block_add(csum, csum2, pos); 3534 pos += p_len; 3535 } 3536 3537 if (!(len -= copy)) 3538 return csum; 3539 offset += copy; 3540 } 3541 start = end; 3542 } 3543 3544 skb_walk_frags(skb, frag_iter) { 3545 int end; 3546 3547 WARN_ON(start > offset + len); 3548 3549 end = start + frag_iter->len; 3550 if ((copy = end - offset) > 0) { 3551 __wsum csum2; 3552 if (copy > len) 3553 copy = len; 3554 csum2 = skb_checksum(frag_iter, offset - start, copy, 3555 0); 3556 csum = csum_block_add(csum, csum2, pos); 3557 if ((len -= copy) == 0) 3558 return csum; 3559 offset += copy; 3560 pos += copy; 3561 } 3562 start = end; 3563 } 3564 BUG_ON(len); 3565 3566 return csum; 3567} 3568EXPORT_SYMBOL(skb_checksum); 3569 3570/* Both of above in one bottle. */ 3571 3572__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 3573 u8 *to, int len) 3574{ 3575 int start = skb_headlen(skb); 3576 int i, copy = start - offset; 3577 struct sk_buff *frag_iter; 3578 int pos = 0; 3579 __wsum csum = 0; 3580 3581 /* Copy header. 
*/ 3582 if (copy > 0) { 3583 if (copy > len) 3584 copy = len; 3585 csum = csum_partial_copy_nocheck(skb->data + offset, to, 3586 copy); 3587 if ((len -= copy) == 0) 3588 return csum; 3589 offset += copy; 3590 to += copy; 3591 pos = copy; 3592 } 3593 3594 if (!skb_frags_readable(skb)) 3595 return 0; 3596 3597 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3598 int end; 3599 3600 WARN_ON(start > offset + len); 3601 3602 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3603 if ((copy = end - offset) > 0) { 3604 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3605 u32 p_off, p_len, copied; 3606 struct page *p; 3607 __wsum csum2; 3608 u8 *vaddr; 3609 3610 if (copy > len) 3611 copy = len; 3612 3613 skb_frag_foreach_page(frag, 3614 skb_frag_off(frag) + offset - start, 3615 copy, p, p_off, p_len, copied) { 3616 vaddr = kmap_atomic(p); 3617 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 3618 to + copied, 3619 p_len); 3620 kunmap_atomic(vaddr); 3621 csum = csum_block_add(csum, csum2, pos); 3622 pos += p_len; 3623 } 3624 3625 if (!(len -= copy)) 3626 return csum; 3627 offset += copy; 3628 to += copy; 3629 } 3630 start = end; 3631 } 3632 3633 skb_walk_frags(skb, frag_iter) { 3634 __wsum csum2; 3635 int end; 3636 3637 WARN_ON(start > offset + len); 3638 3639 end = start + frag_iter->len; 3640 if ((copy = end - offset) > 0) { 3641 if (copy > len) 3642 copy = len; 3643 csum2 = skb_copy_and_csum_bits(frag_iter, 3644 offset - start, 3645 to, copy); 3646 csum = csum_block_add(csum, csum2, pos); 3647 if ((len -= copy) == 0) 3648 return csum; 3649 offset += copy; 3650 to += copy; 3651 pos += copy; 3652 } 3653 start = end; 3654 } 3655 BUG_ON(len); 3656 return csum; 3657} 3658EXPORT_SYMBOL(skb_copy_and_csum_bits); 3659 3660#ifdef CONFIG_NET_CRC32C 3661u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) 3662{ 3663 int start = skb_headlen(skb); 3664 int i, copy = start - offset; 3665 struct sk_buff *frag_iter; 3666 3667 if (copy > 0) { 3668 copy = min(copy, len); 3669 crc = crc32c(crc, skb->data + offset, copy); 3670 len -= copy; 3671 if (len == 0) 3672 return crc; 3673 offset += copy; 3674 } 3675 3676 if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3677 return 0; 3678 3679 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3680 int end; 3681 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3682 3683 WARN_ON(start > offset + len); 3684 3685 end = start + skb_frag_size(frag); 3686 copy = end - offset; 3687 if (copy > 0) { 3688 u32 p_off, p_len, copied; 3689 struct page *p; 3690 u8 *vaddr; 3691 3692 copy = min(copy, len); 3693 skb_frag_foreach_page(frag, 3694 skb_frag_off(frag) + offset - start, 3695 copy, p, p_off, p_len, copied) { 3696 vaddr = kmap_atomic(p); 3697 crc = crc32c(crc, vaddr + p_off, p_len); 3698 kunmap_atomic(vaddr); 3699 } 3700 len -= copy; 3701 if (len == 0) 3702 return crc; 3703 offset += copy; 3704 } 3705 start = end; 3706 } 3707 3708 skb_walk_frags(skb, frag_iter) { 3709 int end; 3710 3711 WARN_ON(start > offset + len); 3712 3713 end = start + frag_iter->len; 3714 copy = end - offset; 3715 if (copy > 0) { 3716 copy = min(copy, len); 3717 crc = skb_crc32c(frag_iter, offset - start, copy, crc); 3718 len -= copy; 3719 if (len == 0) 3720 return crc; 3721 offset += copy; 3722 } 3723 start = end; 3724 } 3725 BUG_ON(len); 3726 3727 return crc; 3728} 3729EXPORT_SYMBOL(skb_crc32c); 3730#endif /* CONFIG_NET_CRC32C */ 3731 3732__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 3733{ 3734 __sum16 sum; 3735 3736 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 
3737 /* See comments in __skb_checksum_complete(). */ 3738 if (likely(!sum)) { 3739 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3740 !skb->csum_complete_sw) 3741 netdev_rx_csum_fault(skb->dev, skb); 3742 } 3743 if (!skb_shared(skb)) 3744 skb->csum_valid = !sum; 3745 return sum; 3746} 3747EXPORT_SYMBOL(__skb_checksum_complete_head); 3748 3749/* This function assumes skb->csum already holds pseudo header's checksum, 3750 * which has been changed from the hardware checksum, for example, by 3751 * __skb_checksum_validate_complete(). And, the original skb->csum must 3752 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 3753 * 3754 * It returns non-zero if the recomputed checksum is still invalid, otherwise 3755 * zero. The new checksum is stored back into skb->csum unless the skb is 3756 * shared. 3757 */ 3758__sum16 __skb_checksum_complete(struct sk_buff *skb) 3759{ 3760 __wsum csum; 3761 __sum16 sum; 3762 3763 csum = skb_checksum(skb, 0, skb->len, 0); 3764 3765 sum = csum_fold(csum_add(skb->csum, csum)); 3766 /* This check is inverted, because we already knew the hardware 3767 * checksum is invalid before calling this function. So, if the 3768 * re-computed checksum is valid instead, then we have a mismatch 3769 * between the original skb->csum and skb_checksum(). This means either 3770 * the original hardware checksum is incorrect or we screw up skb->csum 3771 * when moving skb->data around. 3772 */ 3773 if (likely(!sum)) { 3774 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3775 !skb->csum_complete_sw) 3776 netdev_rx_csum_fault(skb->dev, skb); 3777 } 3778 3779 if (!skb_shared(skb)) { 3780 /* Save full packet checksum */ 3781 skb->csum = csum; 3782 skb->ip_summed = CHECKSUM_COMPLETE; 3783 skb->csum_complete_sw = 1; 3784 skb->csum_valid = !sum; 3785 } 3786 3787 return sum; 3788} 3789EXPORT_SYMBOL(__skb_checksum_complete); 3790 3791 /** 3792 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 3793 * @from: source buffer 3794 * 3795 * Calculates the amount of linear headroom needed in the 'to' skb passed 3796 * into skb_zerocopy(). 3797 */ 3798unsigned int 3799skb_zerocopy_headlen(const struct sk_buff *from) 3800{ 3801 unsigned int hlen = 0; 3802 3803 if (!from->head_frag || 3804 skb_headlen(from) < L1_CACHE_BYTES || 3805 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { 3806 hlen = skb_headlen(from); 3807 if (!hlen) 3808 hlen = from->len; 3809 } 3810 3811 if (skb_has_frag_list(from)) 3812 hlen = from->len; 3813 3814 return hlen; 3815} 3816EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 3817 3818/** 3819 * skb_zerocopy - Zero copy skb to skb 3820 * @to: destination buffer 3821 * @from: source buffer 3822 * @len: number of bytes to copy from source buffer 3823 * @hlen: size of linear headroom in destination buffer 3824 * 3825 * Copies up to `len` bytes from `from` to `to` by creating references 3826 * to the frags in the source buffer. 3827 * 3828 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 3829 * headroom in the `to` buffer. 
3830 * 3831 * Return value: 3832 * 0: everything is OK 3833 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 3834 * -EFAULT: skb_copy_bits() found some problem with skb geometry 3835 */ 3836int 3837skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 3838{ 3839 int i, j = 0; 3840 int plen = 0; /* length of skb->head fragment */ 3841 int ret; 3842 struct page *page; 3843 unsigned int offset; 3844 3845 BUG_ON(!from->head_frag && !hlen); 3846 3847 /* dont bother with small payloads */ 3848 if (len <= skb_tailroom(to)) 3849 return skb_copy_bits(from, 0, skb_put(to, len), len); 3850 3851 if (hlen) { 3852 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 3853 if (unlikely(ret)) 3854 return ret; 3855 len -= hlen; 3856 } else { 3857 plen = min_t(int, skb_headlen(from), len); 3858 if (plen) { 3859 page = virt_to_head_page(from->head); 3860 offset = from->data - (unsigned char *)page_address(page); 3861 __skb_fill_netmem_desc(to, 0, page_to_netmem(page), 3862 offset, plen); 3863 get_page(page); 3864 j = 1; 3865 len -= plen; 3866 } 3867 } 3868 3869 skb_len_add(to, len + plen); 3870 3871 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 3872 skb_tx_error(from); 3873 return -ENOMEM; 3874 } 3875 skb_zerocopy_clone(to, from, GFP_ATOMIC); 3876 3877 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 3878 int size; 3879 3880 if (!len) 3881 break; 3882 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 3883 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 3884 len); 3885 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 3886 len -= size; 3887 skb_frag_ref(to, j); 3888 j++; 3889 } 3890 skb_shinfo(to)->nr_frags = j; 3891 3892 return 0; 3893} 3894EXPORT_SYMBOL_GPL(skb_zerocopy); 3895 3896void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 3897{ 3898 __wsum csum; 3899 long csstart; 3900 3901 if (skb->ip_summed == CHECKSUM_PARTIAL) 3902 csstart = skb_checksum_start_offset(skb); 3903 else 3904 csstart = skb_headlen(skb); 3905 3906 BUG_ON(csstart > skb_headlen(skb)); 3907 3908 skb_copy_from_linear_data(skb, to, csstart); 3909 3910 csum = 0; 3911 if (csstart != skb->len) 3912 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3913 skb->len - csstart); 3914 3915 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3916 long csstuff = csstart + skb->csum_offset; 3917 3918 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3919 } 3920} 3921EXPORT_SYMBOL(skb_copy_and_csum_dev); 3922 3923/** 3924 * skb_dequeue - remove from the head of the queue 3925 * @list: list to dequeue from 3926 * 3927 * Remove the head of the list. The list lock is taken so the function 3928 * may be used safely with other locking list functions. The head item is 3929 * returned or %NULL if the list is empty. 3930 */ 3931 3932struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3933{ 3934 unsigned long flags; 3935 struct sk_buff *result; 3936 3937 spin_lock_irqsave(&list->lock, flags); 3938 result = __skb_dequeue(list); 3939 spin_unlock_irqrestore(&list->lock, flags); 3940 return result; 3941} 3942EXPORT_SYMBOL(skb_dequeue); 3943 3944/** 3945 * skb_dequeue_tail - remove from the tail of the queue 3946 * @list: list to dequeue from 3947 * 3948 * Remove the tail of the list. The list lock is taken so the function 3949 * may be used safely with other locking list functions. The tail item is 3950 * returned or %NULL if the list is empty. 
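 *
 * Paired with skb_queue_tail() this behaves as a LIFO, whereas
 * skb_dequeue() gives FIFO order; a sketch, assuming @list was set up
 * with skb_queue_head_init():
 *
 *	skb_queue_tail(list, skb);
 *	newest = skb_dequeue_tail(list);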
3951 */ 3952struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3953{ 3954 unsigned long flags; 3955 struct sk_buff *result; 3956 3957 spin_lock_irqsave(&list->lock, flags); 3958 result = __skb_dequeue_tail(list); 3959 spin_unlock_irqrestore(&list->lock, flags); 3960 return result; 3961} 3962EXPORT_SYMBOL(skb_dequeue_tail); 3963 3964/** 3965 * skb_queue_purge_reason - empty a list 3966 * @list: list to empty 3967 * @reason: drop reason 3968 * 3969 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3970 * the list and one reference dropped. This function takes the list 3971 * lock and is atomic with respect to other list locking functions. 3972 */ 3973void skb_queue_purge_reason(struct sk_buff_head *list, 3974 enum skb_drop_reason reason) 3975{ 3976 struct sk_buff_head tmp; 3977 unsigned long flags; 3978 3979 if (skb_queue_empty_lockless(list)) 3980 return; 3981 3982 __skb_queue_head_init(&tmp); 3983 3984 spin_lock_irqsave(&list->lock, flags); 3985 skb_queue_splice_init(list, &tmp); 3986 spin_unlock_irqrestore(&list->lock, flags); 3987 3988 __skb_queue_purge_reason(&tmp, reason); 3989} 3990EXPORT_SYMBOL(skb_queue_purge_reason); 3991 3992/** 3993 * skb_rbtree_purge - empty a skb rbtree 3994 * @root: root of the rbtree to empty 3995 * Return value: the sum of truesizes of all purged skbs. 3996 * 3997 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3998 * the list and one reference dropped. This function does not take 3999 * any lock. Synchronization should be handled by the caller (e.g., TCP 4000 * out-of-order queue is protected by the socket lock). 4001 */ 4002unsigned int skb_rbtree_purge(struct rb_root *root) 4003{ 4004 struct rb_node *p = rb_first(root); 4005 unsigned int sum = 0; 4006 4007 while (p) { 4008 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 4009 4010 p = rb_next(p); 4011 rb_erase(&skb->rbnode, root); 4012 sum += skb->truesize; 4013 kfree_skb(skb); 4014 } 4015 return sum; 4016} 4017 4018void skb_errqueue_purge(struct sk_buff_head *list) 4019{ 4020 struct sk_buff *skb, *next; 4021 struct sk_buff_head kill; 4022 unsigned long flags; 4023 4024 __skb_queue_head_init(&kill); 4025 4026 spin_lock_irqsave(&list->lock, flags); 4027 skb_queue_walk_safe(list, skb, next) { 4028 if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || 4029 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) 4030 continue; 4031 __skb_unlink(skb, list); 4032 __skb_queue_tail(&kill, skb); 4033 } 4034 spin_unlock_irqrestore(&list->lock, flags); 4035 __skb_queue_purge(&kill); 4036} 4037EXPORT_SYMBOL(skb_errqueue_purge); 4038 4039/** 4040 * skb_queue_head - queue a buffer at the list head 4041 * @list: list to use 4042 * @newsk: buffer to queue 4043 * 4044 * Queue a buffer at the start of the list. This function takes the 4045 * list lock and can be used safely with other locking &sk_buff functions 4046 * safely. 4047 * 4048 * A buffer cannot be placed on two lists at the same time. 4049 */ 4050void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 4051{ 4052 unsigned long flags; 4053 4054 spin_lock_irqsave(&list->lock, flags); 4055 __skb_queue_head(list, newsk); 4056 spin_unlock_irqrestore(&list->lock, flags); 4057} 4058EXPORT_SYMBOL(skb_queue_head); 4059 4060/** 4061 * skb_queue_tail - queue a buffer at the list tail 4062 * @list: list to use 4063 * @newsk: buffer to queue 4064 * 4065 * Queue a buffer at the tail of the list. 
This function takes the 4066 * list lock and can be used safely with other locking &sk_buff functions 4067 * safely. 4068 * 4069 * A buffer cannot be placed on two lists at the same time. 4070 */ 4071void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 4072{ 4073 unsigned long flags; 4074 4075 spin_lock_irqsave(&list->lock, flags); 4076 __skb_queue_tail(list, newsk); 4077 spin_unlock_irqrestore(&list->lock, flags); 4078} 4079EXPORT_SYMBOL(skb_queue_tail); 4080 4081/** 4082 * skb_unlink - remove a buffer from a list 4083 * @skb: buffer to remove 4084 * @list: list to use 4085 * 4086 * Remove a packet from a list. The list locks are taken and this 4087 * function is atomic with respect to other list locked calls 4088 * 4089 * You must know what list the SKB is on. 4090 */ 4091void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 4092{ 4093 unsigned long flags; 4094 4095 spin_lock_irqsave(&list->lock, flags); 4096 __skb_unlink(skb, list); 4097 spin_unlock_irqrestore(&list->lock, flags); 4098} 4099EXPORT_SYMBOL(skb_unlink); 4100 4101/** 4102 * skb_append - append a buffer 4103 * @old: buffer to insert after 4104 * @newsk: buffer to insert 4105 * @list: list to use 4106 * 4107 * Place a packet after a given packet in a list. The list locks are taken 4108 * and this function is atomic with respect to other list locked calls. 4109 * A buffer cannot be placed on two lists at the same time. 4110 */ 4111void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 4112{ 4113 unsigned long flags; 4114 4115 spin_lock_irqsave(&list->lock, flags); 4116 __skb_queue_after(list, old, newsk); 4117 spin_unlock_irqrestore(&list->lock, flags); 4118} 4119EXPORT_SYMBOL(skb_append); 4120 4121static inline void skb_split_inside_header(struct sk_buff *skb, 4122 struct sk_buff* skb1, 4123 const u32 len, const int pos) 4124{ 4125 int i; 4126 4127 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 4128 pos - len); 4129 /* And move data appendix as is. */ 4130 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 4131 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 4132 4133 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 4134 skb1->unreadable = skb->unreadable; 4135 skb_shinfo(skb)->nr_frags = 0; 4136 skb1->data_len = skb->data_len; 4137 skb1->len += skb1->data_len; 4138 skb->data_len = 0; 4139 skb->len = len; 4140 skb_set_tail_pointer(skb, len); 4141} 4142 4143static inline void skb_split_no_header(struct sk_buff *skb, 4144 struct sk_buff* skb1, 4145 const u32 len, int pos) 4146{ 4147 int i, k = 0; 4148 const int nfrags = skb_shinfo(skb)->nr_frags; 4149 4150 skb_shinfo(skb)->nr_frags = 0; 4151 skb1->len = skb1->data_len = skb->len - len; 4152 skb->len = len; 4153 skb->data_len = len - pos; 4154 4155 for (i = 0; i < nfrags; i++) { 4156 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 4157 4158 if (pos + size > len) { 4159 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 4160 4161 if (pos < len) { 4162 /* Split frag. 4163 * We have two variants in this case: 4164 * 1. Move all the frag to the second 4165 * part, if it is possible. F.e. 4166 * this approach is mandatory for TUX, 4167 * where splitting is expensive. 4168 * 2. Split is accurately. We make this. 
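 *
 * The code below implements variant 2, the accurate split: the page
 * is referenced one extra time, the copy of the frag that skb1
 * already holds has its offset advanced and its size reduced by
 * (len - pos), and the original frag kept in skb is truncated to the
 * (len - pos) bytes that remain in the first part, so both buffers
 * share the page without overlapping ranges.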
4169 */ 4170 skb_frag_ref(skb, i); 4171 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 4172 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 4173 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 4174 skb_shinfo(skb)->nr_frags++; 4175 } 4176 k++; 4177 } else 4178 skb_shinfo(skb)->nr_frags++; 4179 pos += size; 4180 } 4181 skb_shinfo(skb1)->nr_frags = k; 4182 4183 skb1->unreadable = skb->unreadable; 4184} 4185 4186/** 4187 * skb_split - Split fragmented skb to two parts at length len. 4188 * @skb: the buffer to split 4189 * @skb1: the buffer to receive the second part 4190 * @len: new length for skb 4191 */ 4192void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 4193{ 4194 int pos = skb_headlen(skb); 4195 const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; 4196 4197 skb_zcopy_downgrade_managed(skb); 4198 4199 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; 4200 skb_zerocopy_clone(skb1, skb, 0); 4201 if (len < pos) /* Split line is inside header. */ 4202 skb_split_inside_header(skb, skb1, len, pos); 4203 else /* Second chunk has no header, nothing to copy. */ 4204 skb_split_no_header(skb, skb1, len, pos); 4205} 4206EXPORT_SYMBOL(skb_split); 4207 4208/* Shifting from/to a cloned skb is a no-go. 4209 * 4210 * Caller cannot keep skb_shinfo related pointers past calling here! 4211 */ 4212static int skb_prepare_for_shift(struct sk_buff *skb) 4213{ 4214 return skb_unclone_keeptruesize(skb, GFP_ATOMIC); 4215} 4216 4217/** 4218 * skb_shift - Shifts paged data partially from skb to another 4219 * @tgt: buffer into which tail data gets added 4220 * @skb: buffer from which the paged data comes from 4221 * @shiftlen: shift up to this many bytes 4222 * 4223 * Attempts to shift up to shiftlen worth of bytes, which may be less than 4224 * the length of the skb, from skb to tgt. Returns number bytes shifted. 4225 * It's up to caller to free skb if everything was shifted. 4226 * 4227 * If @tgt runs out of frags, the whole operation is aborted. 4228 * 4229 * Skb cannot include anything else but paged data while tgt is allowed 4230 * to have non-paged data as well. 4231 * 4232 * TODO: full sized shift could be optimized but that would need 4233 * specialized skb free'er to handle frags without up-to-date nr_frags. 4234 */ 4235int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 4236{ 4237 int from, to, merge, todo; 4238 skb_frag_t *fragfrom, *fragto; 4239 4240 BUG_ON(shiftlen > skb->len); 4241 4242 if (skb_headlen(skb)) 4243 return 0; 4244 if (skb_zcopy(tgt) || skb_zcopy(skb)) 4245 return 0; 4246 4247 DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); 4248 DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); 4249 4250 todo = shiftlen; 4251 from = 0; 4252 to = skb_shinfo(tgt)->nr_frags; 4253 fragfrom = &skb_shinfo(skb)->frags[from]; 4254 4255 /* Actual merge is delayed until the point when we know we can 4256 * commit all, so that we don't have to undo partial changes 4257 */ 4258 if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 4259 skb_frag_off(fragfrom))) { 4260 merge = -1; 4261 } else { 4262 merge = to - 1; 4263 4264 todo -= skb_frag_size(fragfrom); 4265 if (todo < 0) { 4266 if (skb_prepare_for_shift(skb) || 4267 skb_prepare_for_shift(tgt)) 4268 return 0; 4269 4270 /* All previous frag pointers might be stale! 
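 * skb_prepare_for_shift() goes through skb_unclone_keeptruesize(),
 * which may reallocate skb->head and with it the shared info, so any
 * frag pointer computed before that call must be reloaded, which is
 * what the two assignments below do.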
*/ 4271 fragfrom = &skb_shinfo(skb)->frags[from]; 4272 fragto = &skb_shinfo(tgt)->frags[merge]; 4273 4274 skb_frag_size_add(fragto, shiftlen); 4275 skb_frag_size_sub(fragfrom, shiftlen); 4276 skb_frag_off_add(fragfrom, shiftlen); 4277 4278 goto onlymerged; 4279 } 4280 4281 from++; 4282 } 4283 4284 /* Skip full, not-fitting skb to avoid expensive operations */ 4285 if ((shiftlen == skb->len) && 4286 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 4287 return 0; 4288 4289 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 4290 return 0; 4291 4292 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 4293 if (to == MAX_SKB_FRAGS) 4294 return 0; 4295 4296 fragfrom = &skb_shinfo(skb)->frags[from]; 4297 fragto = &skb_shinfo(tgt)->frags[to]; 4298 4299 if (todo >= skb_frag_size(fragfrom)) { 4300 *fragto = *fragfrom; 4301 todo -= skb_frag_size(fragfrom); 4302 from++; 4303 to++; 4304 4305 } else { 4306 __skb_frag_ref(fragfrom); 4307 skb_frag_page_copy(fragto, fragfrom); 4308 skb_frag_off_copy(fragto, fragfrom); 4309 skb_frag_size_set(fragto, todo); 4310 4311 skb_frag_off_add(fragfrom, todo); 4312 skb_frag_size_sub(fragfrom, todo); 4313 todo = 0; 4314 4315 to++; 4316 break; 4317 } 4318 } 4319 4320 /* Ready to "commit" this state change to tgt */ 4321 skb_shinfo(tgt)->nr_frags = to; 4322 4323 if (merge >= 0) { 4324 fragfrom = &skb_shinfo(skb)->frags[0]; 4325 fragto = &skb_shinfo(tgt)->frags[merge]; 4326 4327 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 4328 __skb_frag_unref(fragfrom, skb->pp_recycle); 4329 } 4330 4331 /* Reposition in the original skb */ 4332 to = 0; 4333 while (from < skb_shinfo(skb)->nr_frags) 4334 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 4335 skb_shinfo(skb)->nr_frags = to; 4336 4337 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 4338 4339onlymerged: 4340 /* Most likely the tgt won't ever need its checksum anymore, skb on 4341 * the other hand might need it if it needs to be resent 4342 */ 4343 tgt->ip_summed = CHECKSUM_PARTIAL; 4344 skb->ip_summed = CHECKSUM_PARTIAL; 4345 4346 skb_len_add(skb, -shiftlen); 4347 skb_len_add(tgt, shiftlen); 4348 4349 return shiftlen; 4350} 4351 4352/** 4353 * skb_prepare_seq_read - Prepare a sequential read of skb data 4354 * @skb: the buffer to read 4355 * @from: lower offset of data to be read 4356 * @to: upper offset of data to be read 4357 * @st: state variable 4358 * 4359 * Initializes the specified state variable. Must be called before 4360 * invoking skb_seq_read() for the first time. 4361 */ 4362void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 4363 unsigned int to, struct skb_seq_state *st) 4364{ 4365 st->lower_offset = from; 4366 st->upper_offset = to; 4367 st->root_skb = st->cur_skb = skb; 4368 st->frag_idx = st->stepped_offset = 0; 4369 st->frag_data = NULL; 4370 st->frag_off = 0; 4371} 4372EXPORT_SYMBOL(skb_prepare_seq_read); 4373 4374/** 4375 * skb_seq_read - Sequentially read skb data 4376 * @consumed: number of bytes consumed by the caller so far 4377 * @data: destination pointer for data to be returned 4378 * @st: state variable 4379 * 4380 * Reads a block of skb data at @consumed relative to the 4381 * lower offset specified to skb_prepare_seq_read(). Assigns 4382 * the head of the data block to @data and returns the length 4383 * of the block or 0 if the end of the skb data or the upper 4384 * offset has been reached. 4385 * 4386 * The caller is not required to consume all of the data 4387 * returned, i.e. 
@consumed is typically set to the number 4388 * of bytes already consumed and the next call to 4389 * skb_seq_read() will return the remaining part of the block. 4390 * 4391 * Note 1: The size of each block of data returned can be arbitrary, 4392 * this limitation is the cost for zerocopy sequential 4393 * reads of potentially non linear data. 4394 * 4395 * Note 2: Fragment lists within fragments are not implemented 4396 * at the moment, state->root_skb could be replaced with 4397 * a stack for this purpose. 4398 */ 4399unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 4400 struct skb_seq_state *st) 4401{ 4402 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 4403 skb_frag_t *frag; 4404 4405 if (unlikely(abs_offset >= st->upper_offset)) { 4406 if (st->frag_data) { 4407 kunmap_atomic(st->frag_data); 4408 st->frag_data = NULL; 4409 } 4410 return 0; 4411 } 4412 4413next_skb: 4414 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 4415 4416 if (abs_offset < block_limit && !st->frag_data) { 4417 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 4418 return block_limit - abs_offset; 4419 } 4420 4421 if (!skb_frags_readable(st->cur_skb)) 4422 return 0; 4423 4424 if (st->frag_idx == 0 && !st->frag_data) 4425 st->stepped_offset += skb_headlen(st->cur_skb); 4426 4427 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 4428 unsigned int pg_idx, pg_off, pg_sz; 4429 4430 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 4431 4432 pg_idx = 0; 4433 pg_off = skb_frag_off(frag); 4434 pg_sz = skb_frag_size(frag); 4435 4436 if (skb_frag_must_loop(skb_frag_page(frag))) { 4437 pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; 4438 pg_off = offset_in_page(pg_off + st->frag_off); 4439 pg_sz = min_t(unsigned int, pg_sz - st->frag_off, 4440 PAGE_SIZE - pg_off); 4441 } 4442 4443 block_limit = pg_sz + st->stepped_offset; 4444 if (abs_offset < block_limit) { 4445 if (!st->frag_data) 4446 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); 4447 4448 *data = (u8 *)st->frag_data + pg_off + 4449 (abs_offset - st->stepped_offset); 4450 4451 return block_limit - abs_offset; 4452 } 4453 4454 if (st->frag_data) { 4455 kunmap_atomic(st->frag_data); 4456 st->frag_data = NULL; 4457 } 4458 4459 st->stepped_offset += pg_sz; 4460 st->frag_off += pg_sz; 4461 if (st->frag_off == skb_frag_size(frag)) { 4462 st->frag_off = 0; 4463 st->frag_idx++; 4464 } 4465 } 4466 4467 if (st->frag_data) { 4468 kunmap_atomic(st->frag_data); 4469 st->frag_data = NULL; 4470 } 4471 4472 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 4473 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 4474 st->frag_idx = 0; 4475 goto next_skb; 4476 } else if (st->cur_skb->next) { 4477 st->cur_skb = st->cur_skb->next; 4478 st->frag_idx = 0; 4479 goto next_skb; 4480 } 4481 4482 return 0; 4483} 4484EXPORT_SYMBOL(skb_seq_read); 4485 4486/** 4487 * skb_abort_seq_read - Abort a sequential read of skb data 4488 * @st: state variable 4489 * 4490 * Must be called if skb_seq_read() was not called until it 4491 * returned 0. 
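 *
 * A sketch of the overall prepare/read/abort pattern (illustrative
 * only, not from in-tree code; the CRC is just a stand-in consumer):
 *
 *	static u32 example_crc(struct sk_buff *skb, unsigned int len)
 *	{
 *		struct skb_seq_state st;
 *		unsigned int consumed = 0, avail;
 *		const u8 *data;
 *		u32 crc = 0;
 *
 *		skb_prepare_seq_read(skb, 0, len, &st);
 *		while ((avail = skb_seq_read(consumed, &data, &st)) != 0) {
 *			crc = crc32_le(crc, data, avail);
 *			consumed += avail;
 *		}
 *		skb_abort_seq_read(&st);
 *		return crc;
 *	}
 *
 * The abort is what guarantees the last fragment mapping is dropped
 * when a caller stops before skb_seq_read() has returned 0; after a
 * 0 return it is a harmless no-op.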
4492 */ 4493void skb_abort_seq_read(struct skb_seq_state *st) 4494{ 4495 if (st->frag_data) 4496 kunmap_atomic(st->frag_data); 4497} 4498EXPORT_SYMBOL(skb_abort_seq_read); 4499 4500/** 4501 * skb_copy_seq_read() - copy from a skb_seq_state to a buffer 4502 * @st: source skb_seq_state 4503 * @offset: offset in source 4504 * @to: destination buffer 4505 * @len: number of bytes to copy 4506 * 4507 * Copy @len bytes from @offset bytes into the source @st to the destination 4508 * buffer @to. `offset` should increase (or be unchanged) with each subsequent 4509 * call to this function. If offset needs to decrease from the previous use `st` 4510 * should be reset first. 4511 * 4512 * Return: 0 on success or -EINVAL if the copy ended early 4513 */ 4514int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) 4515{ 4516 const u8 *data; 4517 u32 sqlen; 4518 4519 for (;;) { 4520 sqlen = skb_seq_read(offset, &data, st); 4521 if (sqlen == 0) 4522 return -EINVAL; 4523 if (sqlen >= len) { 4524 memcpy(to, data, len); 4525 return 0; 4526 } 4527 memcpy(to, data, sqlen); 4528 to += sqlen; 4529 offset += sqlen; 4530 len -= sqlen; 4531 } 4532} 4533EXPORT_SYMBOL(skb_copy_seq_read); 4534 4535#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 4536 4537static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 4538 struct ts_config *conf, 4539 struct ts_state *state) 4540{ 4541 return skb_seq_read(offset, text, TS_SKB_CB(state)); 4542} 4543 4544static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 4545{ 4546 skb_abort_seq_read(TS_SKB_CB(state)); 4547} 4548 4549/** 4550 * skb_find_text - Find a text pattern in skb data 4551 * @skb: the buffer to look in 4552 * @from: search offset 4553 * @to: search limit 4554 * @config: textsearch configuration 4555 * 4556 * Finds a pattern in the skb data according to the specified 4557 * textsearch configuration. Use textsearch_next() to retrieve 4558 * subsequent occurrences of the pattern. Returns the offset 4559 * to the first occurrence or UINT_MAX if no match was found. 4560 */ 4561unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 4562 unsigned int to, struct ts_config *config) 4563{ 4564 unsigned int patlen = config->ops->get_pattern_len(config); 4565 struct ts_state state; 4566 unsigned int ret; 4567 4568 BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); 4569 4570 config->get_next_block = skb_ts_get_next_block; 4571 config->finish = skb_ts_finish; 4572 4573 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 4574 4575 ret = textsearch_find(config, &state); 4576 return (ret + patlen <= to - from ? ret : UINT_MAX); 4577} 4578EXPORT_SYMBOL(skb_find_text); 4579 4580int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 4581 int offset, size_t size, size_t max_frags) 4582{ 4583 int i = skb_shinfo(skb)->nr_frags; 4584 4585 if (skb_can_coalesce(skb, i, page, offset)) { 4586 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 4587 } else if (i < max_frags) { 4588 skb_zcopy_downgrade_managed(skb); 4589 get_page(page); 4590 skb_fill_page_desc_noacc(skb, i, page, offset, size); 4591 } else { 4592 return -EMSGSIZE; 4593 } 4594 4595 return 0; 4596} 4597EXPORT_SYMBOL_GPL(skb_append_pagefrags); 4598 4599/** 4600 * skb_pull_rcsum - pull skb and update receive checksum 4601 * @skb: buffer to update 4602 * @len: length of data pulled 4603 * 4604 * This function performs an skb_pull on the packet and updates 4605 * the CHECKSUM_COMPLETE checksum. 
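 *
 * (Concretely, the skb_postpull_rcsum() call below subtracts the
 * checksum of the @len pulled bytes from skb->csum when ip_summed is
 * CHECKSUM_COMPLETE, keeping the stored checksum in sync with the new
 * skb->data.)
 *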
It should be used on 4606 * receive path processing instead of skb_pull unless you know 4607 * that the checksum difference is zero (e.g., a valid IP header) 4608 * or you are setting ip_summed to CHECKSUM_NONE. 4609 */ 4610void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 4611{ 4612 unsigned char *data = skb->data; 4613 4614 BUG_ON(len > skb->len); 4615 __skb_pull(skb, len); 4616 skb_postpull_rcsum(skb, data, len); 4617 return skb->data; 4618} 4619EXPORT_SYMBOL_GPL(skb_pull_rcsum); 4620 4621static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 4622{ 4623 skb_frag_t head_frag; 4624 struct page *page; 4625 4626 page = virt_to_head_page(frag_skb->head); 4627 skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - 4628 (unsigned char *)page_address(page), 4629 skb_headlen(frag_skb)); 4630 return head_frag; 4631} 4632 4633struct sk_buff *skb_segment_list(struct sk_buff *skb, 4634 netdev_features_t features, 4635 unsigned int offset) 4636{ 4637 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 4638 unsigned int tnl_hlen = skb_tnl_header_len(skb); 4639 unsigned int delta_len = 0; 4640 struct sk_buff *tail = NULL; 4641 struct sk_buff *nskb, *tmp; 4642 int len_diff, err; 4643 4644 /* Only skb_gro_receive_list generated skbs arrive here */ 4645 DEBUG_NET_WARN_ON_ONCE(!(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)); 4646 4647 skb_push(skb, -skb_network_offset(skb) + offset); 4648 4649 /* Ensure the head is writeable before touching the shared info */ 4650 err = skb_unclone(skb, GFP_ATOMIC); 4651 if (err) 4652 goto err_linearize; 4653 4654 skb_shinfo(skb)->frag_list = NULL; 4655 4656 while (list_skb) { 4657 nskb = list_skb; 4658 list_skb = list_skb->next; 4659 4660 DEBUG_NET_WARN_ON_ONCE(nskb->sk); 4661 4662 err = 0; 4663 if (skb_shared(nskb)) { 4664 tmp = skb_clone(nskb, GFP_ATOMIC); 4665 if (tmp) { 4666 consume_skb(nskb); 4667 nskb = tmp; 4668 err = skb_unclone(nskb, GFP_ATOMIC); 4669 } else { 4670 err = -ENOMEM; 4671 } 4672 } 4673 4674 if (!tail) 4675 skb->next = nskb; 4676 else 4677 tail->next = nskb; 4678 4679 if (unlikely(err)) { 4680 nskb->next = list_skb; 4681 goto err_linearize; 4682 } 4683 4684 tail = nskb; 4685 4686 delta_len += nskb->len; 4687 4688 skb_push(nskb, -skb_network_offset(nskb) + offset); 4689 4690 skb_release_head_state(nskb); 4691 len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); 4692 __copy_skb_header(nskb, skb); 4693 4694 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 4695 nskb->transport_header += len_diff; 4696 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 4697 nskb->data - tnl_hlen, 4698 offset + tnl_hlen); 4699 4700 if (skb_needs_linearize(nskb, features) && 4701 __skb_linearize(nskb)) 4702 goto err_linearize; 4703 } 4704 4705 skb->data_len = skb->data_len - delta_len; 4706 skb->len = skb->len - delta_len; 4707 4708 skb_gso_reset(skb); 4709 4710 skb->prev = tail; 4711 4712 if (skb_needs_linearize(skb, features) && 4713 __skb_linearize(skb)) 4714 goto err_linearize; 4715 4716 skb_get(skb); 4717 4718 return skb; 4719 4720err_linearize: 4721 kfree_skb_list(skb->next); 4722 skb->next = NULL; 4723 return ERR_PTR(-ENOMEM); 4724} 4725EXPORT_SYMBOL_GPL(skb_segment_list); 4726 4727/** 4728 * skb_segment - Perform protocol segmentation on skb. 4729 * @head_skb: buffer to segment 4730 * @features: features for the output path (see dev->features) 4731 * 4732 * This function performs segmentation on the given skb. 
It returns 4733 * a pointer to the first in a list of new skbs for the segments. 4734 * In case of error it returns ERR_PTR(err). 4735 */ 4736struct sk_buff *skb_segment(struct sk_buff *head_skb, 4737 netdev_features_t features) 4738{ 4739 struct sk_buff *segs = NULL; 4740 struct sk_buff *tail = NULL; 4741 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 4742 unsigned int mss = skb_shinfo(head_skb)->gso_size; 4743 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 4744 unsigned int offset = doffset; 4745 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 4746 unsigned int partial_segs = 0; 4747 unsigned int headroom; 4748 unsigned int len = head_skb->len; 4749 struct sk_buff *frag_skb; 4750 skb_frag_t *frag; 4751 __be16 proto; 4752 bool csum, sg; 4753 int err = -ENOMEM; 4754 int i = 0; 4755 int nfrags, pos; 4756 4757 if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && 4758 mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { 4759 struct sk_buff *check_skb; 4760 4761 for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { 4762 if (skb_headlen(check_skb) && !check_skb->head_frag) { 4763 /* gso_size is untrusted, and we have a frag_list with 4764 * a linear non head_frag item. 4765 * 4766 * If head_skb's headlen does not fit requested gso_size, 4767 * it means that the frag_list members do NOT terminate 4768 * on exact gso_size boundaries. Hence we cannot perform 4769 * skb_frag_t page sharing. Therefore we must fallback to 4770 * copying the frag_list skbs; we do so by disabling SG. 4771 */ 4772 features &= ~NETIF_F_SG; 4773 break; 4774 } 4775 } 4776 } 4777 4778 __skb_push(head_skb, doffset); 4779 proto = skb_network_protocol(head_skb, NULL); 4780 if (unlikely(!proto)) 4781 return ERR_PTR(-EINVAL); 4782 4783 sg = !!(features & NETIF_F_SG); 4784 csum = !!can_checksum_protocol(features, proto); 4785 4786 if (sg && csum && (mss != GSO_BY_FRAGS)) { 4787 if (!(features & NETIF_F_GSO_PARTIAL)) { 4788 struct sk_buff *iter; 4789 unsigned int frag_len; 4790 4791 if (!list_skb || 4792 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 4793 goto normal; 4794 4795 /* If we get here then all the required 4796 * GSO features except frag_list are supported. 4797 * Try to split the SKB to multiple GSO SKBs 4798 * with no frag_list. 4799 * Currently we can do that only when the buffers don't 4800 * have a linear part and all the buffers except 4801 * the last are of the same length. 4802 */ 4803 frag_len = list_skb->len; 4804 skb_walk_frags(head_skb, iter) { 4805 if (frag_len != iter->len && iter->next) 4806 goto normal; 4807 if (skb_headlen(iter) && !iter->head_frag) 4808 goto normal; 4809 4810 len -= iter->len; 4811 } 4812 4813 if (len != frag_len) 4814 goto normal; 4815 } 4816 4817 /* GSO partial only requires that we trim off any excess that 4818 * doesn't fit into an MSS sized block, so take care of that 4819 * now. 4820 * Cap len to not accidentally hit GSO_BY_FRAGS. 
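 *
 * As a worked example of the computation below: with len = 64000 and
 * mss = 1400, partial_segs = 64000 / 1400 = 45, so mss is scaled up
 * to 45 * 1400 = 63000 and each output segment carries up to 45
 * MSS-sized chunks; the per-MSS gso_size and a gso_segs of 45 are put
 * back on the segments in the partial_segs fixup further down.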
4821 */ 4822 partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; 4823 if (partial_segs > 1) 4824 mss *= partial_segs; 4825 else 4826 partial_segs = 0; 4827 } 4828 4829normal: 4830 headroom = skb_headroom(head_skb); 4831 pos = skb_headlen(head_skb); 4832 4833 if (skb_orphan_frags(head_skb, GFP_ATOMIC)) 4834 return ERR_PTR(-ENOMEM); 4835 4836 nfrags = skb_shinfo(head_skb)->nr_frags; 4837 frag = skb_shinfo(head_skb)->frags; 4838 frag_skb = head_skb; 4839 4840 do { 4841 struct sk_buff *nskb; 4842 skb_frag_t *nskb_frag; 4843 int hsize; 4844 int size; 4845 4846 if (unlikely(mss == GSO_BY_FRAGS)) { 4847 len = list_skb->len; 4848 } else { 4849 len = head_skb->len - offset; 4850 if (len > mss) 4851 len = mss; 4852 } 4853 4854 hsize = skb_headlen(head_skb) - offset; 4855 4856 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && 4857 (skb_headlen(list_skb) == len || sg)) { 4858 BUG_ON(skb_headlen(list_skb) > len); 4859 4860 nskb = skb_clone(list_skb, GFP_ATOMIC); 4861 if (unlikely(!nskb)) 4862 goto err; 4863 4864 i = 0; 4865 nfrags = skb_shinfo(list_skb)->nr_frags; 4866 frag = skb_shinfo(list_skb)->frags; 4867 frag_skb = list_skb; 4868 pos += skb_headlen(list_skb); 4869 4870 while (pos < offset + len) { 4871 BUG_ON(i >= nfrags); 4872 4873 size = skb_frag_size(frag); 4874 if (pos + size > offset + len) 4875 break; 4876 4877 i++; 4878 pos += size; 4879 frag++; 4880 } 4881 4882 list_skb = list_skb->next; 4883 4884 if (unlikely(pskb_trim(nskb, len))) { 4885 kfree_skb(nskb); 4886 goto err; 4887 } 4888 4889 hsize = skb_end_offset(nskb); 4890 if (skb_cow_head(nskb, doffset + headroom)) { 4891 kfree_skb(nskb); 4892 goto err; 4893 } 4894 4895 nskb->truesize += skb_end_offset(nskb) - hsize; 4896 skb_release_head_state(nskb); 4897 __skb_push(nskb, doffset); 4898 } else { 4899 if (hsize < 0) 4900 hsize = 0; 4901 if (hsize > len || !sg) 4902 hsize = len; 4903 4904 nskb = __alloc_skb(hsize + doffset + headroom, 4905 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 4906 NUMA_NO_NODE); 4907 4908 if (unlikely(!nskb)) 4909 goto err; 4910 4911 skb_reserve(nskb, headroom); 4912 __skb_put(nskb, doffset); 4913 } 4914 4915 if (segs) 4916 tail->next = nskb; 4917 else 4918 segs = nskb; 4919 tail = nskb; 4920 4921 __copy_skb_header(nskb, head_skb); 4922 4923 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 4924 skb_reset_mac_len(nskb); 4925 4926 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 4927 nskb->data - tnl_hlen, 4928 doffset + tnl_hlen); 4929 4930 if (nskb->len == len + doffset) 4931 goto perform_csum_check; 4932 4933 if (!sg) { 4934 if (!csum) { 4935 if (!nskb->remcsum_offload) 4936 nskb->ip_summed = CHECKSUM_NONE; 4937 SKB_GSO_CB(nskb)->csum = 4938 skb_copy_and_csum_bits(head_skb, offset, 4939 skb_put(nskb, 4940 len), 4941 len); 4942 SKB_GSO_CB(nskb)->csum_start = 4943 skb_headroom(nskb) + doffset; 4944 } else { 4945 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) 4946 goto err; 4947 } 4948 continue; 4949 } 4950 4951 nskb_frag = skb_shinfo(nskb)->frags; 4952 4953 skb_copy_from_linear_data_offset(head_skb, offset, 4954 skb_put(nskb, hsize), hsize); 4955 4956 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & 4957 SKBFL_SHARED_FRAG; 4958 4959 if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 4960 goto err; 4961 4962 while (pos < offset + len) { 4963 if (i >= nfrags) { 4964 if (skb_orphan_frags(list_skb, GFP_ATOMIC) || 4965 skb_zerocopy_clone(nskb, list_skb, 4966 GFP_ATOMIC)) 4967 goto err; 4968 4969 i = 0; 4970 nfrags = skb_shinfo(list_skb)->nr_frags; 4971 frag = 
skb_shinfo(list_skb)->frags; 4972 frag_skb = list_skb; 4973 if (!skb_headlen(list_skb)) { 4974 BUG_ON(!nfrags); 4975 } else { 4976 BUG_ON(!list_skb->head_frag); 4977 4978 /* to make room for head_frag. */ 4979 i--; 4980 frag--; 4981 } 4982 4983 list_skb = list_skb->next; 4984 } 4985 4986 if (unlikely(skb_shinfo(nskb)->nr_frags >= 4987 MAX_SKB_FRAGS)) { 4988 net_warn_ratelimited( 4989 "skb_segment: too many frags: %u %u\n", 4990 pos, mss); 4991 err = -EINVAL; 4992 goto err; 4993 } 4994 4995 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 4996 __skb_frag_ref(nskb_frag); 4997 size = skb_frag_size(nskb_frag); 4998 4999 if (pos < offset) { 5000 skb_frag_off_add(nskb_frag, offset - pos); 5001 skb_frag_size_sub(nskb_frag, offset - pos); 5002 } 5003 5004 skb_shinfo(nskb)->nr_frags++; 5005 5006 if (pos + size <= offset + len) { 5007 i++; 5008 frag++; 5009 pos += size; 5010 } else { 5011 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 5012 goto skip_fraglist; 5013 } 5014 5015 nskb_frag++; 5016 } 5017 5018skip_fraglist: 5019 nskb->data_len = len - hsize; 5020 nskb->len += nskb->data_len; 5021 nskb->truesize += nskb->data_len; 5022 5023perform_csum_check: 5024 if (!csum) { 5025 if (skb_has_shared_frag(nskb) && 5026 __skb_linearize(nskb)) 5027 goto err; 5028 5029 if (!nskb->remcsum_offload) 5030 nskb->ip_summed = CHECKSUM_NONE; 5031 SKB_GSO_CB(nskb)->csum = 5032 skb_checksum(nskb, doffset, 5033 nskb->len - doffset, 0); 5034 SKB_GSO_CB(nskb)->csum_start = 5035 skb_headroom(nskb) + doffset; 5036 } 5037 } while ((offset += len) < head_skb->len); 5038 5039 /* Some callers want to get the end of the list. 5040 * Put it in segs->prev to avoid walking the list. 5041 * (see validate_xmit_skb_list() for example) 5042 */ 5043 segs->prev = tail; 5044 5045 if (partial_segs) { 5046 struct sk_buff *iter; 5047 int type = skb_shinfo(head_skb)->gso_type; 5048 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 5049 5050 /* Update type to add partial and then remove dodgy if set */ 5051 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 5052 type &= ~SKB_GSO_DODGY; 5053 5054 /* Update GSO info and prepare to start updating headers on 5055 * our way back down the stack of protocols. 5056 */ 5057 for (iter = segs; iter; iter = iter->next) { 5058 skb_shinfo(iter)->gso_size = gso_size; 5059 skb_shinfo(iter)->gso_segs = partial_segs; 5060 skb_shinfo(iter)->gso_type = type; 5061 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 5062 } 5063 5064 if (tail->len - doffset <= gso_size) 5065 skb_shinfo(tail)->gso_size = 0; 5066 else if (tail != segs) 5067 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 5068 } 5069 5070 /* Following permits correct backpressure, for protocols 5071 * using skb_set_owner_w(). 5072 * Idea is to tranfert ownership from head_skb to last segment. 
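 *
 * Concretely, the swap below moves head_skb's socket, sock_wfree
 * destructor and accounted truesize onto the last segment, so the
 * sender's sk_wmem_alloc is only released once that final segment is
 * freed by the lower layers instead of when the original GSO skb is
 * consumed.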
5073 */ 5074 if (head_skb->destructor == sock_wfree) { 5075 swap(tail->truesize, head_skb->truesize); 5076 swap(tail->destructor, head_skb->destructor); 5077 swap(tail->sk, head_skb->sk); 5078 } 5079 return segs; 5080 5081err: 5082 kfree_skb_list(segs); 5083 return ERR_PTR(err); 5084} 5085EXPORT_SYMBOL_GPL(skb_segment); 5086 5087#ifdef CONFIG_SKB_EXTENSIONS 5088#define SKB_EXT_ALIGN_VALUE 8 5089#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 5090 5091static const u8 skb_ext_type_len[] = { 5092#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 5093 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 5094#endif 5095#ifdef CONFIG_XFRM 5096 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 5097#endif 5098#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 5099 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 5100#endif 5101#if IS_ENABLED(CONFIG_MPTCP) 5102 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 5103#endif 5104#if IS_ENABLED(CONFIG_MCTP_FLOWS) 5105 [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), 5106#endif 5107#if IS_ENABLED(CONFIG_INET_PSP) 5108 [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), 5109#endif 5110}; 5111 5112static __always_inline unsigned int skb_ext_total_length(void) 5113{ 5114 unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); 5115 int i; 5116 5117 for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) 5118 l += skb_ext_type_len[i]; 5119 5120 return l; 5121} 5122 5123static void skb_extensions_init(void) 5124{ 5125 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 5126#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) 5127 BUILD_BUG_ON(skb_ext_total_length() > 255); 5128#endif 5129 5130 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 5131 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 5132 0, 5133 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 5134 NULL); 5135} 5136#else 5137static void skb_extensions_init(void) {} 5138#endif 5139 5140/* The SKB kmem_cache slab is critical for network performance. Never 5141 * merge/alias the slab with similar sized objects. This avoids fragmentation 5142 * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. 5143 */ 5144#ifndef CONFIG_SLUB_TINY 5145#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE 5146#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ 5147#define FLAG_SKB_NO_MERGE 0 5148#endif 5149 5150void __init skb_init(void) 5151{ 5152 net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", 5153 sizeof(struct sk_buff), 5154 0, 5155 SLAB_HWCACHE_ALIGN|SLAB_PANIC| 5156 FLAG_SKB_NO_MERGE, 5157 offsetof(struct sk_buff, cb), 5158 sizeof_field(struct sk_buff, cb), 5159 NULL); 5160 skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); 5161 5162 net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 5163 sizeof(struct sk_buff_fclones), 5164 0, 5165 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 5166 NULL); 5167 /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. 5168 * struct skb_shared_info is located at the end of skb->head, 5169 * and should not be copied to/from user. 
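 *
 * In the kmem_cache_create_usercopy() call below this translates to a
 * usercopy window of useroffset 0 and usersize SKB_SMALL_HEAD_HEADROOM,
 * so hardened usercopy (when enabled) rejects copies that would reach
 * the trailing skb_shared_info.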
5170 */ 5171 net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", 5172 SKB_SMALL_HEAD_CACHE_SIZE, 5173 0, 5174 SLAB_HWCACHE_ALIGN | SLAB_PANIC, 5175 0, 5176 SKB_SMALL_HEAD_HEADROOM, 5177 NULL); 5178 skb_extensions_init(); 5179} 5180 5181static int 5182__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 5183 unsigned int recursion_level) 5184{ 5185 int start = skb_headlen(skb); 5186 int i, copy = start - offset; 5187 struct sk_buff *frag_iter; 5188 int elt = 0; 5189 5190 if (unlikely(recursion_level >= 24)) 5191 return -EMSGSIZE; 5192 5193 if (copy > 0) { 5194 if (copy > len) 5195 copy = len; 5196 sg_set_buf(sg, skb->data + offset, copy); 5197 elt++; 5198 if ((len -= copy) == 0) 5199 return elt; 5200 offset += copy; 5201 } 5202 5203 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 5204 int end; 5205 5206 WARN_ON(start > offset + len); 5207 5208 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 5209 if ((copy = end - offset) > 0) { 5210 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 5211 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 5212 return -EMSGSIZE; 5213 5214 if (copy > len) 5215 copy = len; 5216 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 5217 skb_frag_off(frag) + offset - start); 5218 elt++; 5219 if (!(len -= copy)) 5220 return elt; 5221 offset += copy; 5222 } 5223 start = end; 5224 } 5225 5226 skb_walk_frags(skb, frag_iter) { 5227 int end, ret; 5228 5229 WARN_ON(start > offset + len); 5230 5231 end = start + frag_iter->len; 5232 if ((copy = end - offset) > 0) { 5233 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 5234 return -EMSGSIZE; 5235 5236 if (copy > len) 5237 copy = len; 5238 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 5239 copy, recursion_level + 1); 5240 if (unlikely(ret < 0)) 5241 return ret; 5242 elt += ret; 5243 if ((len -= copy) == 0) 5244 return elt; 5245 offset += copy; 5246 } 5247 start = end; 5248 } 5249 BUG_ON(len); 5250 return elt; 5251} 5252 5253/** 5254 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 5255 * @skb: Socket buffer containing the buffers to be mapped 5256 * @sg: The scatter-gather list to map into 5257 * @offset: The offset into the buffer's contents to start mapping 5258 * @len: Length of buffer space to be mapped 5259 * 5260 * Fill the specified scatter-gather list with mappings/pointers into a 5261 * region of the buffer space attached to a socket buffer. Returns either 5262 * the number of scatterlist items used, or -EMSGSIZE if the contents 5263 * could not fit. 5264 */ 5265int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 5266{ 5267 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); 5268 5269 if (nsg <= 0) 5270 return nsg; 5271 5272 sg_mark_end(&sg[nsg - 1]); 5273 5274 return nsg; 5275} 5276EXPORT_SYMBOL_GPL(skb_to_sgvec); 5277 5278/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given 5279 * sglist without mark the sg which contain last skb data as the end. 5280 * So the caller can mannipulate sg list as will when padding new data after 5281 * the first call without calling sg_unmark_end to expend sg list. 5282 * 5283 * Scenario to use skb_to_sgvec_nomark: 5284 * 1. sg_init_table 5285 * 2. skb_to_sgvec_nomark(payload1) 5286 * 3. skb_to_sgvec_nomark(payload2) 5287 * 5288 * This is equivalent to: 5289 * 1. sg_init_table 5290 * 2. skb_to_sgvec(payload1) 5291 * 3. sg_unmark_end 5292 * 4. 
skb_to_sgvec(payload2) 5293 * 5294 * When mapping multiple payload conditionally, skb_to_sgvec_nomark 5295 * is more preferable. 5296 */ 5297int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, 5298 int offset, int len) 5299{ 5300 return __skb_to_sgvec(skb, sg, offset, len, 0); 5301} 5302EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); 5303 5304 5305 5306/** 5307 * skb_cow_data - Check that a socket buffer's data buffers are writable 5308 * @skb: The socket buffer to check. 5309 * @tailbits: Amount of trailing space to be added 5310 * @trailer: Returned pointer to the skb where the @tailbits space begins 5311 * 5312 * Make sure that the data buffers attached to a socket buffer are 5313 * writable. If they are not, private copies are made of the data buffers 5314 * and the socket buffer is set to use these instead. 5315 * 5316 * If @tailbits is given, make sure that there is space to write @tailbits 5317 * bytes of data beyond current end of socket buffer. @trailer will be 5318 * set to point to the skb in which this space begins. 5319 * 5320 * The number of scatterlist elements required to completely map the 5321 * COW'd and extended socket buffer will be returned. 5322 */ 5323int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 5324{ 5325 int copyflag; 5326 int elt; 5327 struct sk_buff *skb1, **skb_p; 5328 5329 /* If skb is cloned or its head is paged, reallocate 5330 * head pulling out all the pages (pages are considered not writable 5331 * at the moment even if they are anonymous). 5332 */ 5333 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 5334 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 5335 return -ENOMEM; 5336 5337 /* Easy case. Most of packets will go this way. */ 5338 if (!skb_has_frag_list(skb)) { 5339 /* A little of trouble, not enough of space for trailer. 5340 * This should not happen, when stack is tuned to generate 5341 * good frames. OK, on miss we reallocate and reserve even more 5342 * space, 128 bytes is fair. */ 5343 5344 if (skb_tailroom(skb) < tailbits && 5345 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 5346 return -ENOMEM; 5347 5348 /* Voila! */ 5349 *trailer = skb; 5350 return 1; 5351 } 5352 5353 /* Misery. We are in troubles, going to mincer fragments... */ 5354 5355 elt = 1; 5356 skb_p = &skb_shinfo(skb)->frag_list; 5357 copyflag = 0; 5358 5359 while ((skb1 = *skb_p) != NULL) { 5360 int ntail = 0; 5361 5362 /* The fragment is partially pulled by someone, 5363 * this can happen on input. Copy it and everything 5364 * after it. */ 5365 5366 if (skb_shared(skb1)) 5367 copyflag = 1; 5368 5369 /* If the skb is the last, worry about trailer. */ 5370 5371 if (skb1->next == NULL && tailbits) { 5372 if (skb_shinfo(skb1)->nr_frags || 5373 skb_has_frag_list(skb1) || 5374 skb_tailroom(skb1) < tailbits) 5375 ntail = tailbits + 128; 5376 } 5377 5378 if (copyflag || 5379 skb_cloned(skb1) || 5380 ntail || 5381 skb_shinfo(skb1)->nr_frags || 5382 skb_has_frag_list(skb1)) { 5383 struct sk_buff *skb2; 5384 5385 /* Fuck, we are miserable poor guys... */ 5386 if (ntail == 0) 5387 skb2 = skb_copy(skb1, GFP_ATOMIC); 5388 else 5389 skb2 = skb_copy_expand(skb1, 5390 skb_headroom(skb1), 5391 ntail, 5392 GFP_ATOMIC); 5393 if (unlikely(skb2 == NULL)) 5394 return -ENOMEM; 5395 5396 if (skb1->sk) 5397 skb_set_owner_w(skb2, skb1->sk); 5398 5399 /* Looking around. Are we still alive? 
5400 * OK, link new skb, drop old one */ 5401 5402 skb2->next = skb1->next; 5403 *skb_p = skb2; 5404 kfree_skb(skb1); 5405 skb1 = skb2; 5406 } 5407 elt++; 5408 *trailer = skb1; 5409 skb_p = &skb1->next; 5410 } 5411 5412 return elt; 5413} 5414EXPORT_SYMBOL_GPL(skb_cow_data); 5415 5416static void sock_rmem_free(struct sk_buff *skb) 5417{ 5418 struct sock *sk = skb->sk; 5419 5420 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 5421} 5422 5423static void skb_set_err_queue(struct sk_buff *skb) 5424{ 5425 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 5426 * So, it is safe to (mis)use it to mark skbs on the error queue. 5427 */ 5428 skb->pkt_type = PACKET_OUTGOING; 5429 BUILD_BUG_ON(PACKET_OUTGOING == 0); 5430} 5431 5432/* 5433 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 5434 */ 5435int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 5436{ 5437 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 5438 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 5439 return -ENOMEM; 5440 5441 skb_orphan(skb); 5442 skb->sk = sk; 5443 skb->destructor = sock_rmem_free; 5444 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 5445 skb_set_err_queue(skb); 5446 5447 /* before exiting rcu section, make sure dst is refcounted */ 5448 skb_dst_force(skb); 5449 5450 skb_queue_tail(&sk->sk_error_queue, skb); 5451 if (!sock_flag(sk, SOCK_DEAD)) 5452 sk_error_report(sk); 5453 return 0; 5454} 5455EXPORT_SYMBOL(sock_queue_err_skb); 5456 5457static bool is_icmp_err_skb(const struct sk_buff *skb) 5458{ 5459 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 5460 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 5461} 5462 5463struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 5464{ 5465 struct sk_buff_head *q = &sk->sk_error_queue; 5466 struct sk_buff *skb, *skb_next = NULL; 5467 bool icmp_next = false; 5468 unsigned long flags; 5469 5470 if (skb_queue_empty_lockless(q)) 5471 return NULL; 5472 5473 spin_lock_irqsave(&q->lock, flags); 5474 skb = __skb_dequeue(q); 5475 if (skb && (skb_next = skb_peek(q))) { 5476 icmp_next = is_icmp_err_skb(skb_next); 5477 if (icmp_next) 5478 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 5479 } 5480 spin_unlock_irqrestore(&q->lock, flags); 5481 5482 if (is_icmp_err_skb(skb) && !icmp_next) 5483 sk->sk_err = 0; 5484 5485 if (skb_next) 5486 sk_error_report(sk); 5487 5488 return skb; 5489} 5490EXPORT_SYMBOL(sock_dequeue_err_skb); 5491 5492/** 5493 * skb_clone_sk - create clone of skb, and take reference to socket 5494 * @skb: the skb to clone 5495 * 5496 * This function creates a clone of a buffer that holds a reference on 5497 * sk_refcnt. Buffers created via this function are meant to be 5498 * returned using sock_queue_err_skb, or free via kfree_skb. 5499 * 5500 * When passing buffers allocated with this function to sock_queue_err_skb 5501 * it is necessary to wrap the call with sock_hold/sock_put in order to 5502 * prevent the socket from being released prior to being enqueued on 5503 * the sk_error_queue. 
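 *
 * A sketch of that wrapping (illustrative only, not from in-tree
 * code; error handling trimmed to the minimum):
 *
 *	static void example_report(struct sk_buff *skb)
 *	{
 *		struct sk_buff *clone = skb_clone_sk(skb);
 *		struct sock *sk;
 *
 *		if (!clone)
 *			return;
 *		sk = clone->sk;
 *		sock_hold(sk);
 *		if (sock_queue_err_skb(sk, clone))
 *			kfree_skb(clone);
 *		sock_put(sk);
 *	}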
5504 */ 5505struct sk_buff *skb_clone_sk(struct sk_buff *skb) 5506{ 5507 struct sock *sk = skb->sk; 5508 struct sk_buff *clone; 5509 5510 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 5511 return NULL; 5512 5513 clone = skb_clone(skb, GFP_ATOMIC); 5514 if (!clone) { 5515 sock_put(sk); 5516 return NULL; 5517 } 5518 5519 clone->sk = sk; 5520 clone->destructor = sock_efree; 5521 5522 return clone; 5523} 5524EXPORT_SYMBOL(skb_clone_sk); 5525 5526static void __skb_complete_tx_timestamp(struct sk_buff *skb, 5527 struct sock *sk, 5528 int tstype, 5529 bool opt_stats) 5530{ 5531 struct sock_exterr_skb *serr; 5532 int err; 5533 5534 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 5535 5536 serr = SKB_EXT_ERR(skb); 5537 memset(serr, 0, sizeof(*serr)); 5538 serr->ee.ee_errno = ENOMSG; 5539 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 5540 serr->ee.ee_info = tstype; 5541 serr->opt_stats = opt_stats; 5542 serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; 5543 if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 5544 serr->ee.ee_data = skb_shinfo(skb)->tskey; 5545 if (sk_is_tcp(sk)) 5546 serr->ee.ee_data -= atomic_read(&sk->sk_tskey); 5547 } 5548 5549 err = sock_queue_err_skb(sk, skb); 5550 5551 if (err) 5552 kfree_skb(skb); 5553} 5554 5555static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 5556{ 5557 bool ret; 5558 5559 if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) 5560 return true; 5561 5562 read_lock_bh(&sk->sk_callback_lock); 5563 ret = sk->sk_socket && sk->sk_socket->file && 5564 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 5565 read_unlock_bh(&sk->sk_callback_lock); 5566 return ret; 5567} 5568 5569void skb_complete_tx_timestamp(struct sk_buff *skb, 5570 struct skb_shared_hwtstamps *hwtstamps) 5571{ 5572 struct sock *sk = skb->sk; 5573 5574 if (!skb_may_tx_timestamp(sk, false)) 5575 goto err; 5576 5577 /* Take a reference to prevent skb_orphan() from freeing the socket, 5578 * but only if the socket refcount is not zero. 5579 */ 5580 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5581 *skb_hwtstamps(skb) = *hwtstamps; 5582 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 5583 sock_put(sk); 5584 return; 5585 } 5586 5587err: 5588 kfree_skb(skb); 5589} 5590EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 5591 5592static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, 5593 struct skb_shared_hwtstamps *hwtstamps, 5594 int tstype) 5595{ 5596 switch (tstype) { 5597 case SCM_TSTAMP_SCHED: 5598 return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; 5599 case SCM_TSTAMP_SND: 5600 return skb_shinfo(skb)->tx_flags & (hwtstamps ? 
SKBTX_HW_TSTAMP_NOBPF : 5601 SKBTX_SW_TSTAMP); 5602 case SCM_TSTAMP_ACK: 5603 return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; 5604 case SCM_TSTAMP_COMPLETION: 5605 return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; 5606 } 5607 5608 return false; 5609} 5610 5611static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, 5612 struct skb_shared_hwtstamps *hwtstamps, 5613 struct sock *sk, 5614 int tstype) 5615{ 5616 int op; 5617 5618 switch (tstype) { 5619 case SCM_TSTAMP_SCHED: 5620 op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; 5621 break; 5622 case SCM_TSTAMP_SND: 5623 if (hwtstamps) { 5624 op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; 5625 *skb_hwtstamps(skb) = *hwtstamps; 5626 } else { 5627 op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; 5628 } 5629 break; 5630 case SCM_TSTAMP_ACK: 5631 op = BPF_SOCK_OPS_TSTAMP_ACK_CB; 5632 break; 5633 default: 5634 return; 5635 } 5636 5637 bpf_skops_tx_timestamping(sk, skb, op); 5638} 5639 5640void __skb_tstamp_tx(struct sk_buff *orig_skb, 5641 const struct sk_buff *ack_skb, 5642 struct skb_shared_hwtstamps *hwtstamps, 5643 struct sock *sk, int tstype) 5644{ 5645 struct sk_buff *skb; 5646 bool tsonly, opt_stats = false; 5647 u32 tsflags; 5648 5649 if (!sk) 5650 return; 5651 5652 if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) 5653 skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, 5654 sk, tstype); 5655 5656 if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) 5657 return; 5658 5659 tsflags = READ_ONCE(sk->sk_tsflags); 5660 if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 5661 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 5662 return; 5663 5664 tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 5665 if (!skb_may_tx_timestamp(sk, tsonly)) 5666 return; 5667 5668 if (tsonly) { 5669#ifdef CONFIG_INET 5670 if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && 5671 sk_is_tcp(sk)) { 5672 skb = tcp_get_timestamping_opt_stats(sk, orig_skb, 5673 ack_skb); 5674 opt_stats = true; 5675 } else 5676#endif 5677 skb = alloc_skb(0, GFP_ATOMIC); 5678 } else { 5679 skb = skb_clone(orig_skb, GFP_ATOMIC); 5680 5681 if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { 5682 kfree_skb(skb); 5683 return; 5684 } 5685 } 5686 if (!skb) 5687 return; 5688 5689 if (tsonly) { 5690 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 5691 SKBTX_ANY_TSTAMP; 5692 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 5693 } 5694 5695 if (hwtstamps) 5696 *skb_hwtstamps(skb) = *hwtstamps; 5697 else 5698 __net_timestamp(skb); 5699 5700 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 5701} 5702EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 5703 5704void skb_tstamp_tx(struct sk_buff *orig_skb, 5705 struct skb_shared_hwtstamps *hwtstamps) 5706{ 5707 return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, 5708 SCM_TSTAMP_SND); 5709} 5710EXPORT_SYMBOL_GPL(skb_tstamp_tx); 5711 5712#ifdef CONFIG_WIRELESS 5713void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 5714{ 5715 struct sock *sk = skb->sk; 5716 struct sock_exterr_skb *serr; 5717 int err = 1; 5718 5719 skb->wifi_acked_valid = 1; 5720 skb->wifi_acked = acked; 5721 5722 serr = SKB_EXT_ERR(skb); 5723 memset(serr, 0, sizeof(*serr)); 5724 serr->ee.ee_errno = ENOMSG; 5725 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 5726 5727 /* Take a reference to prevent skb_orphan() from freeing the socket, 5728 * but only if the socket refcount is not zero. 
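 *
 * (As an aside on skb_tstamp_tx() above: a driver that has obtained a
 * hardware timestamp for a transmitted skb typically reports it
 * roughly as below before releasing the buffer; the function name and
 * timestamp source are illustrative only.)
 *
 *	static void example_tx_complete(struct sk_buff *skb, u64 ns)
 *	{
 *		struct skb_shared_hwtstamps hwts = { };
 *
 *		hwts.hwtstamp = ns_to_ktime(ns);
 *		skb_tstamp_tx(skb, &hwts);
 *		dev_kfree_skb_any(skb);
 *	}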
5729 */ 5730 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5731 err = sock_queue_err_skb(sk, skb); 5732 sock_put(sk); 5733 } 5734 if (err) 5735 kfree_skb(skb); 5736} 5737EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 5738#endif /* CONFIG_WIRELESS */ 5739 5740/** 5741 * skb_partial_csum_set - set up and verify partial csum values for packet 5742 * @skb: the skb to set 5743 * @start: the number of bytes after skb->data to start checksumming. 5744 * @off: the offset from start to place the checksum. 5745 * 5746 * For untrusted partially-checksummed packets, we need to make sure the values 5747 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 5748 * 5749 * This function checks and sets those values and skb->ip_summed: if this 5750 * returns false you should drop the packet. 5751 */ 5752bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 5753{ 5754 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 5755 u32 csum_start = skb_headroom(skb) + (u32)start; 5756 5757 if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { 5758 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 5759 start, off, skb_headroom(skb), skb_headlen(skb)); 5760 return false; 5761 } 5762 skb->ip_summed = CHECKSUM_PARTIAL; 5763 skb->csum_start = csum_start; 5764 skb->csum_offset = off; 5765 skb->transport_header = csum_start; 5766 return true; 5767} 5768EXPORT_SYMBOL_GPL(skb_partial_csum_set); 5769 5770static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 5771 unsigned int max) 5772{ 5773 if (skb_headlen(skb) >= len) 5774 return 0; 5775 5776 /* If we need to pullup then pullup to the max, so we 5777 * won't need to do it again. 5778 */ 5779 if (max > skb->len) 5780 max = skb->len; 5781 5782 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 5783 return -ENOMEM; 5784 5785 if (skb_headlen(skb) < len) 5786 return -EPROTO; 5787 5788 return 0; 5789} 5790 5791#define MAX_TCP_HDR_LEN (15 * 4) 5792 5793static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 5794 typeof(IPPROTO_IP) proto, 5795 unsigned int off) 5796{ 5797 int err; 5798 5799 switch (proto) { 5800 case IPPROTO_TCP: 5801 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 5802 off + MAX_TCP_HDR_LEN); 5803 if (!err && !skb_partial_csum_set(skb, off, 5804 offsetof(struct tcphdr, 5805 check))) 5806 err = -EPROTO; 5807 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 5808 5809 case IPPROTO_UDP: 5810 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 5811 off + sizeof(struct udphdr)); 5812 if (!err && !skb_partial_csum_set(skb, off, 5813 offsetof(struct udphdr, 5814 check))) 5815 err = -EPROTO; 5816 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 5817 } 5818 5819 return ERR_PTR(-EPROTO); 5820} 5821 5822/* This value should be large enough to cover a tagged ethernet header plus 5823 * maximally sized IP and TCP or UDP headers. 
5824 */ 5825#define MAX_IP_HDR_LEN 128 5826 5827static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 5828{ 5829 unsigned int off; 5830 bool fragment; 5831 __sum16 *csum; 5832 int err; 5833 5834 fragment = false; 5835 5836 err = skb_maybe_pull_tail(skb, 5837 sizeof(struct iphdr), 5838 MAX_IP_HDR_LEN); 5839 if (err < 0) 5840 goto out; 5841 5842 if (ip_is_fragment(ip_hdr(skb))) 5843 fragment = true; 5844 5845 off = ip_hdrlen(skb); 5846 5847 err = -EPROTO; 5848 5849 if (fragment) 5850 goto out; 5851 5852 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 5853 if (IS_ERR(csum)) 5854 return PTR_ERR(csum); 5855 5856 if (recalculate) 5857 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 5858 ip_hdr(skb)->daddr, 5859 skb->len - off, 5860 ip_hdr(skb)->protocol, 0); 5861 err = 0; 5862 5863out: 5864 return err; 5865} 5866 5867/* This value should be large enough to cover a tagged ethernet header plus 5868 * an IPv6 header, all options, and a maximal TCP or UDP header. 5869 */ 5870#define MAX_IPV6_HDR_LEN 256 5871 5872#define OPT_HDR(type, skb, off) \ 5873 (type *)(skb_network_header(skb) + (off)) 5874 5875static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 5876{ 5877 int err; 5878 u8 nexthdr; 5879 unsigned int off; 5880 unsigned int len; 5881 bool fragment; 5882 bool done; 5883 __sum16 *csum; 5884 5885 fragment = false; 5886 done = false; 5887 5888 off = sizeof(struct ipv6hdr); 5889 5890 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 5891 if (err < 0) 5892 goto out; 5893 5894 nexthdr = ipv6_hdr(skb)->nexthdr; 5895 5896 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 5897 while (off <= len && !done) { 5898 switch (nexthdr) { 5899 case IPPROTO_DSTOPTS: 5900 case IPPROTO_HOPOPTS: 5901 case IPPROTO_ROUTING: { 5902 struct ipv6_opt_hdr *hp; 5903 5904 err = skb_maybe_pull_tail(skb, 5905 off + 5906 sizeof(struct ipv6_opt_hdr), 5907 MAX_IPV6_HDR_LEN); 5908 if (err < 0) 5909 goto out; 5910 5911 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 5912 nexthdr = hp->nexthdr; 5913 off += ipv6_optlen(hp); 5914 break; 5915 } 5916 case IPPROTO_AH: { 5917 struct ip_auth_hdr *hp; 5918 5919 err = skb_maybe_pull_tail(skb, 5920 off + 5921 sizeof(struct ip_auth_hdr), 5922 MAX_IPV6_HDR_LEN); 5923 if (err < 0) 5924 goto out; 5925 5926 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 5927 nexthdr = hp->nexthdr; 5928 off += ipv6_authlen(hp); 5929 break; 5930 } 5931 case IPPROTO_FRAGMENT: { 5932 struct frag_hdr *hp; 5933 5934 err = skb_maybe_pull_tail(skb, 5935 off + 5936 sizeof(struct frag_hdr), 5937 MAX_IPV6_HDR_LEN); 5938 if (err < 0) 5939 goto out; 5940 5941 hp = OPT_HDR(struct frag_hdr, skb, off); 5942 5943 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 5944 fragment = true; 5945 5946 nexthdr = hp->nexthdr; 5947 off += sizeof(struct frag_hdr); 5948 break; 5949 } 5950 default: 5951 done = true; 5952 break; 5953 } 5954 } 5955 5956 err = -EPROTO; 5957 5958 if (!done || fragment) 5959 goto out; 5960 5961 csum = skb_checksum_setup_ip(skb, nexthdr, off); 5962 if (IS_ERR(csum)) 5963 return PTR_ERR(csum); 5964 5965 if (recalculate) 5966 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 5967 &ipv6_hdr(skb)->daddr, 5968 skb->len - off, nexthdr, 0); 5969 err = 0; 5970 5971out: 5972 return err; 5973} 5974 5975/** 5976 * skb_checksum_setup - set up partial checksum offset 5977 * @skb: the skb to set up 5978 * @recalculate: if true the pseudo-header checksum will be recalculated 5979 */ 5980int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 5981{ 5982 int err; 
5983 5984 switch (skb->protocol) { 5985 case htons(ETH_P_IP): 5986 err = skb_checksum_setup_ipv4(skb, recalculate); 5987 break; 5988 5989 case htons(ETH_P_IPV6): 5990 err = skb_checksum_setup_ipv6(skb, recalculate); 5991 break; 5992 5993 default: 5994 err = -EPROTO; 5995 break; 5996 } 5997 5998 return err; 5999} 6000EXPORT_SYMBOL(skb_checksum_setup); 6001 6002/** 6003 * skb_checksum_maybe_trim - maybe trims the given skb 6004 * @skb: the skb to check 6005 * @transport_len: the data length beyond the network header 6006 * 6007 * Checks whether the given skb has data beyond the given transport length. 6008 * If so, returns a cloned skb trimmed to this transport length. 6009 * Otherwise returns the provided skb. Returns NULL in error cases 6010 * (e.g. transport_len exceeds skb length or out-of-memory). 6011 * 6012 * Caller needs to set the skb transport header and free any returned skb if it 6013 * differs from the provided skb. 6014 */ 6015static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 6016 unsigned int transport_len) 6017{ 6018 struct sk_buff *skb_chk; 6019 unsigned int len = skb_transport_offset(skb) + transport_len; 6020 int ret; 6021 6022 if (skb->len < len) 6023 return NULL; 6024 else if (skb->len == len) 6025 return skb; 6026 6027 skb_chk = skb_clone(skb, GFP_ATOMIC); 6028 if (!skb_chk) 6029 return NULL; 6030 6031 ret = pskb_trim_rcsum(skb_chk, len); 6032 if (ret) { 6033 kfree_skb(skb_chk); 6034 return NULL; 6035 } 6036 6037 return skb_chk; 6038} 6039 6040/** 6041 * skb_checksum_trimmed - validate checksum of an skb 6042 * @skb: the skb to check 6043 * @transport_len: the data length beyond the network header 6044 * @skb_chkf: checksum function to use 6045 * 6046 * Applies the given checksum function skb_chkf to the provided skb. 6047 * Returns a checked and maybe trimmed skb. Returns NULL on error. 6048 * 6049 * If the skb has data beyond the given transport length, then a 6050 * trimmed & cloned skb is checked and returned. 6051 * 6052 * Caller needs to set the skb transport header and free any returned skb if it 6053 * differs from the provided skb. 
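 *
 * A usage sketch along the lines of the IGMP/MLD callers, assuming
 * the transport header has already been set as required above (the
 * wrapper names are illustrative, not exported helpers):
 *
 *	static __sum16 example_chkf(struct sk_buff *skb)
 *	{
 *		return skb_checksum_simple_validate(skb);
 *	}
 *
 *	static int example_validate(struct sk_buff *skb, unsigned int tlen)
 *	{
 *		struct sk_buff *skb_chk;
 *
 *		skb_chk = skb_checksum_trimmed(skb, tlen, example_chkf);
 *		if (!skb_chk)
 *			return -EINVAL;
 *		if (skb_chk != skb)
 *			kfree_skb(skb_chk);
 *		return 0;
 *	}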
 */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
				     unsigned int transport_len,
				     __sum16(*skb_chkf)(struct sk_buff *skb))
{
	struct sk_buff *skb_chk;
	unsigned int offset = skb_transport_offset(skb);
	__sum16 ret;

	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
	if (!skb_chk)
		goto err;

	if (!pskb_may_pull(skb_chk, offset))
		goto err;

	skb_pull_rcsum(skb_chk, offset);
	ret = skb_chkf(skb_chk);
	skb_push_rcsum(skb_chk, offset);

	if (ret)
		goto err;

	return skb_chk;

err:
	if (skb_chk && skb_chk != skb)
		kfree_skb(skb_chk);

	return NULL;

}
EXPORT_SYMBOL(skb_checksum_trimmed);

void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
			     skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);

void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
	if (head_stolen) {
		skb_release_head_state(skb);
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
	} else {
		__kfree_skb(skb);
	}
}
EXPORT_SYMBOL(kfree_skb_partial);

/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean
 * @delta_truesize: how much more was allocated than was requested
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
		      bool *fragstolen, int *delta_truesize)
{
	struct skb_shared_info *to_shinfo, *from_shinfo;
	int i, delta, len = from->len;

	*fragstolen = false;

	if (skb_cloned(to))
		return false;

	/* In general, avoid mixing page_pool and non-page_pool allocated
	 * pages within the same SKB. In theory we could take full
	 * references if @from is cloned and !@to->pp_recycle, but it's
	 * tricky (due to a potential race with the clone disappearing) and
	 * rare, so not worth dealing with.
6129 */ 6130 if (to->pp_recycle != from->pp_recycle) 6131 return false; 6132 6133 if (skb_frags_readable(from) != skb_frags_readable(to)) 6134 return false; 6135 6136 if (len <= skb_tailroom(to) && skb_frags_readable(from)) { 6137 if (len) 6138 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 6139 *delta_truesize = 0; 6140 return true; 6141 } 6142 6143 to_shinfo = skb_shinfo(to); 6144 from_shinfo = skb_shinfo(from); 6145 if (to_shinfo->frag_list || from_shinfo->frag_list) 6146 return false; 6147 if (skb_zcopy(to) || skb_zcopy(from)) 6148 return false; 6149 6150 if (skb_headlen(from) != 0) { 6151 struct page *page; 6152 unsigned int offset; 6153 6154 if (to_shinfo->nr_frags + 6155 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 6156 return false; 6157 6158 if (skb_head_is_locked(from)) 6159 return false; 6160 6161 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 6162 6163 page = virt_to_head_page(from->head); 6164 offset = from->data - (unsigned char *)page_address(page); 6165 6166 skb_fill_page_desc(to, to_shinfo->nr_frags, 6167 page, offset, skb_headlen(from)); 6168 *fragstolen = true; 6169 } else { 6170 if (to_shinfo->nr_frags + 6171 from_shinfo->nr_frags > MAX_SKB_FRAGS) 6172 return false; 6173 6174 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 6175 } 6176 6177 WARN_ON_ONCE(delta < len); 6178 6179 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 6180 from_shinfo->frags, 6181 from_shinfo->nr_frags * sizeof(skb_frag_t)); 6182 to_shinfo->nr_frags += from_shinfo->nr_frags; 6183 6184 if (!skb_cloned(from)) 6185 from_shinfo->nr_frags = 0; 6186 6187 /* if the skb is not cloned this does nothing 6188 * since we set nr_frags to 0. 6189 */ 6190 if (skb_pp_frag_ref(from)) { 6191 for (i = 0; i < from_shinfo->nr_frags; i++) 6192 __skb_frag_ref(&from_shinfo->frags[i]); 6193 } 6194 6195 to->truesize += delta; 6196 to->len += len; 6197 to->data_len += len; 6198 6199 *delta_truesize = delta; 6200 return true; 6201} 6202EXPORT_SYMBOL(skb_try_coalesce); 6203 6204/** 6205 * skb_scrub_packet - scrub an skb 6206 * 6207 * @skb: buffer to clean 6208 * @xnet: packet is crossing netns 6209 * 6210 * skb_scrub_packet can be used after encapsulating or decapsulating a packet 6211 * into/from a tunnel. Some information have to be cleared during these 6212 * operations. 6213 * skb_scrub_packet can also be used to clean a skb before injecting it in 6214 * another namespace (@xnet == true). We have to clear all information in the 6215 * skb that could impact namespace isolation. 
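 *
 * Note that the per-packet mark and timestamps are only cleared when
 * @xnet is true; for a plain tunnel scrub (@xnet == false) the function
 * stops after resetting the packet type, dst, extensions, conntrack,
 * IPVS and switchdev offload state.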
6216 */ 6217void skb_scrub_packet(struct sk_buff *skb, bool xnet) 6218{ 6219 skb->pkt_type = PACKET_HOST; 6220 skb->skb_iif = 0; 6221 skb->ignore_df = 0; 6222 skb_dst_drop(skb); 6223 skb_ext_reset(skb); 6224 nf_reset_ct(skb); 6225 nf_reset_trace(skb); 6226 6227#ifdef CONFIG_NET_SWITCHDEV 6228 skb->offload_fwd_mark = 0; 6229 skb->offload_l3_fwd_mark = 0; 6230#endif 6231 ipvs_reset(skb); 6232 6233 if (!xnet) 6234 return; 6235 6236 skb->mark = 0; 6237 skb_clear_tstamp(skb); 6238} 6239EXPORT_SYMBOL_GPL(skb_scrub_packet); 6240 6241static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 6242{ 6243 int mac_len, meta_len; 6244 void *meta; 6245 6246 if (skb_cow(skb, skb_headroom(skb)) < 0) { 6247 kfree_skb(skb); 6248 return NULL; 6249 } 6250 6251 mac_len = skb->data - skb_mac_header(skb); 6252 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 6253 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 6254 mac_len - VLAN_HLEN - ETH_TLEN); 6255 } 6256 6257 meta_len = skb_metadata_len(skb); 6258 if (meta_len) { 6259 meta = skb_metadata_end(skb) - meta_len; 6260 memmove(meta + VLAN_HLEN, meta, meta_len); 6261 } 6262 6263 skb->mac_header += VLAN_HLEN; 6264 return skb; 6265} 6266 6267struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 6268{ 6269 struct vlan_hdr *vhdr; 6270 u16 vlan_tci; 6271 6272 if (unlikely(skb_vlan_tag_present(skb))) { 6273 /* vlan_tci is already set-up so leave this for another time */ 6274 return skb; 6275 } 6276 6277 skb = skb_share_check(skb, GFP_ATOMIC); 6278 if (unlikely(!skb)) 6279 goto err_free; 6280 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 6281 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 6282 goto err_free; 6283 6284 vhdr = (struct vlan_hdr *)skb->data; 6285 vlan_tci = ntohs(vhdr->h_vlan_TCI); 6286 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 6287 6288 skb_pull_rcsum(skb, VLAN_HLEN); 6289 vlan_set_encap_proto(skb, vhdr); 6290 6291 skb = skb_reorder_vlan_header(skb); 6292 if (unlikely(!skb)) 6293 goto err_free; 6294 6295 skb_reset_network_header(skb); 6296 if (!skb_transport_header_was_set(skb)) 6297 skb_reset_transport_header(skb); 6298 skb_reset_mac_len(skb); 6299 6300 return skb; 6301 6302err_free: 6303 kfree_skb(skb); 6304 return NULL; 6305} 6306EXPORT_SYMBOL(skb_vlan_untag); 6307 6308int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) 6309{ 6310 if (!pskb_may_pull(skb, write_len)) 6311 return -ENOMEM; 6312 6313 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 6314 return 0; 6315 6316 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 6317} 6318EXPORT_SYMBOL(skb_ensure_writable); 6319 6320int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) 6321{ 6322 int needed_headroom = dev->needed_headroom; 6323 int needed_tailroom = dev->needed_tailroom; 6324 6325 /* For tail taggers, we need to pad short frames ourselves, to ensure 6326 * that the tail tag does not fail at its role of being at the end of 6327 * the packet, once the conduit interface pads the frame. Account for 6328 * that pad length here, and pad later. 6329 */ 6330 if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) 6331 needed_tailroom += ETH_ZLEN - skb->len; 6332 /* skb_headroom() returns unsigned int... */ 6333 needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); 6334 needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); 6335 6336 if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) 6337 /* No reallocation needed, yay! 
*/ 6338 return 0; 6339 6340 return pskb_expand_head(skb, needed_headroom, needed_tailroom, 6341 GFP_ATOMIC); 6342} 6343EXPORT_SYMBOL(skb_ensure_writable_head_tail); 6344 6345/* remove VLAN header from packet and update csum accordingly. 6346 * expects a non skb_vlan_tag_present skb with a vlan tag payload 6347 */ 6348int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 6349{ 6350 int offset = skb->data - skb_mac_header(skb); 6351 int err; 6352 6353 if (WARN_ONCE(offset, 6354 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 6355 offset)) { 6356 return -EINVAL; 6357 } 6358 6359 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 6360 if (unlikely(err)) 6361 return err; 6362 6363 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 6364 6365 vlan_remove_tag(skb, vlan_tci); 6366 6367 skb->mac_header += VLAN_HLEN; 6368 6369 if (skb_network_offset(skb) < ETH_HLEN) 6370 skb_set_network_header(skb, ETH_HLEN); 6371 6372 skb_reset_mac_len(skb); 6373 6374 return err; 6375} 6376EXPORT_SYMBOL(__skb_vlan_pop); 6377 6378/* Pop a vlan tag either from hwaccel or from payload. 6379 * Expects skb->data at mac header. 6380 */ 6381int skb_vlan_pop(struct sk_buff *skb) 6382{ 6383 u16 vlan_tci; 6384 __be16 vlan_proto; 6385 int err; 6386 6387 if (likely(skb_vlan_tag_present(skb))) { 6388 __vlan_hwaccel_clear_tag(skb); 6389 } else { 6390 if (unlikely(!eth_type_vlan(skb->protocol))) 6391 return 0; 6392 6393 err = __skb_vlan_pop(skb, &vlan_tci); 6394 if (err) 6395 return err; 6396 } 6397 /* move next vlan tag to hw accel tag */ 6398 if (likely(!eth_type_vlan(skb->protocol))) 6399 return 0; 6400 6401 vlan_proto = skb->protocol; 6402 err = __skb_vlan_pop(skb, &vlan_tci); 6403 if (unlikely(err)) 6404 return err; 6405 6406 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 6407 return 0; 6408} 6409EXPORT_SYMBOL(skb_vlan_pop); 6410 6411/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 6412 * Expects skb->data at mac header. 6413 */ 6414int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 6415{ 6416 if (skb_vlan_tag_present(skb)) { 6417 int offset = skb->data - skb_mac_header(skb); 6418 int err; 6419 6420 if (WARN_ONCE(offset, 6421 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 6422 offset)) { 6423 return -EINVAL; 6424 } 6425 6426 err = __vlan_insert_tag(skb, skb->vlan_proto, 6427 skb_vlan_tag_get(skb)); 6428 if (err) 6429 return err; 6430 6431 skb->protocol = skb->vlan_proto; 6432 skb->network_header -= VLAN_HLEN; 6433 6434 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 6435 } 6436 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 6437 return 0; 6438} 6439EXPORT_SYMBOL(skb_vlan_push); 6440 6441/** 6442 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 6443 * 6444 * @skb: Socket buffer to modify 6445 * 6446 * Drop the Ethernet header of @skb. 6447 * 6448 * Expects that skb->data points to the mac header and that no VLAN tags are 6449 * present. 6450 * 6451 * Returns 0 on success, -errno otherwise. 
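 *
 * An illustrative pairing (e.g. openvswitch's pop_eth/push_eth actions
 * work this way): decapsulate here, then restore the header later with
 * skb_eth_push():
 *
 *	err = skb_eth_pop(skb);
 *	...
 *	err = skb_eth_push(skb, dst_mac, src_mac);
 *
 * where dst_mac/src_mac stand in for the caller's addresses.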
6452 */ 6453int skb_eth_pop(struct sk_buff *skb) 6454{ 6455 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 6456 skb_network_offset(skb) < ETH_HLEN) 6457 return -EPROTO; 6458 6459 skb_pull_rcsum(skb, ETH_HLEN); 6460 skb_reset_mac_header(skb); 6461 skb_reset_mac_len(skb); 6462 6463 return 0; 6464} 6465EXPORT_SYMBOL(skb_eth_pop); 6466 6467/** 6468 * skb_eth_push() - Add a new Ethernet header at the head of a packet 6469 * 6470 * @skb: Socket buffer to modify 6471 * @dst: Destination MAC address of the new header 6472 * @src: Source MAC address of the new header 6473 * 6474 * Prepend @skb with a new Ethernet header. 6475 * 6476 * Expects that skb->data points to the mac header, which must be empty. 6477 * 6478 * Returns 0 on success, -errno otherwise. 6479 */ 6480int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 6481 const unsigned char *src) 6482{ 6483 struct ethhdr *eth; 6484 int err; 6485 6486 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 6487 return -EPROTO; 6488 6489 err = skb_cow_head(skb, sizeof(*eth)); 6490 if (err < 0) 6491 return err; 6492 6493 skb_push(skb, sizeof(*eth)); 6494 skb_reset_mac_header(skb); 6495 skb_reset_mac_len(skb); 6496 6497 eth = eth_hdr(skb); 6498 ether_addr_copy(eth->h_dest, dst); 6499 ether_addr_copy(eth->h_source, src); 6500 eth->h_proto = skb->protocol; 6501 6502 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 6503 6504 return 0; 6505} 6506EXPORT_SYMBOL(skb_eth_push); 6507 6508/* Update the ethertype of hdr and the skb csum value if required. */ 6509static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 6510 __be16 ethertype) 6511{ 6512 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6513 __be16 diff[] = { ~hdr->h_proto, ethertype }; 6514 6515 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6516 } 6517 6518 hdr->h_proto = ethertype; 6519} 6520 6521/** 6522 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 6523 * the packet 6524 * 6525 * @skb: buffer 6526 * @mpls_lse: MPLS label stack entry to push 6527 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 6528 * @mac_len: length of the MAC header 6529 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 6530 * ethernet 6531 * 6532 * Expects skb->data at mac header. 6533 * 6534 * Returns 0 on success, -errno otherwise. 6535 */ 6536int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 6537 int mac_len, bool ethernet) 6538{ 6539 struct mpls_shim_hdr *lse; 6540 int err; 6541 6542 if (unlikely(!eth_p_mpls(mpls_proto))) 6543 return -EINVAL; 6544 6545 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. 
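	 * If the skb already carries tunnel encapsulation state
	 * (skb->encapsulation), bail out with -EINVAL below instead of
	 * pushing an MPLS header that GSO could not segment correctly.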
*/ 6546 if (skb->encapsulation) 6547 return -EINVAL; 6548 6549 err = skb_cow_head(skb, MPLS_HLEN); 6550 if (unlikely(err)) 6551 return err; 6552 6553 if (!skb->inner_protocol) { 6554 skb_set_inner_network_header(skb, skb_network_offset(skb)); 6555 skb_set_inner_protocol(skb, skb->protocol); 6556 } 6557 6558 skb_push(skb, MPLS_HLEN); 6559 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 6560 mac_len); 6561 skb_reset_mac_header(skb); 6562 skb_set_network_header(skb, mac_len); 6563 skb_reset_mac_len(skb); 6564 6565 lse = mpls_hdr(skb); 6566 lse->label_stack_entry = mpls_lse; 6567 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 6568 6569 if (ethernet && mac_len >= ETH_HLEN) 6570 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 6571 skb->protocol = mpls_proto; 6572 6573 return 0; 6574} 6575EXPORT_SYMBOL_GPL(skb_mpls_push); 6576 6577/** 6578 * skb_mpls_pop() - pop the outermost MPLS header 6579 * 6580 * @skb: buffer 6581 * @next_proto: ethertype of header after popped MPLS header 6582 * @mac_len: length of the MAC header 6583 * @ethernet: flag to indicate if the packet is ethernet 6584 * 6585 * Expects skb->data at mac header. 6586 * 6587 * Returns 0 on success, -errno otherwise. 6588 */ 6589int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 6590 bool ethernet) 6591{ 6592 int err; 6593 6594 if (unlikely(!eth_p_mpls(skb->protocol))) 6595 return 0; 6596 6597 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 6598 if (unlikely(err)) 6599 return err; 6600 6601 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 6602 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 6603 mac_len); 6604 6605 __skb_pull(skb, MPLS_HLEN); 6606 skb_reset_mac_header(skb); 6607 skb_set_network_header(skb, mac_len); 6608 6609 if (ethernet && mac_len >= ETH_HLEN) { 6610 struct ethhdr *hdr; 6611 6612 /* use mpls_hdr() to get ethertype to account for VLANs. */ 6613 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 6614 skb_mod_eth_type(skb, hdr, next_proto); 6615 } 6616 skb->protocol = next_proto; 6617 6618 return 0; 6619} 6620EXPORT_SYMBOL_GPL(skb_mpls_pop); 6621 6622/** 6623 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 6624 * 6625 * @skb: buffer 6626 * @mpls_lse: new MPLS label stack entry to update to 6627 * 6628 * Expects skb->data at mac header. 6629 * 6630 * Returns 0 on success, -errno otherwise. 6631 */ 6632int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 6633{ 6634 int err; 6635 6636 if (unlikely(!eth_p_mpls(skb->protocol))) 6637 return -EINVAL; 6638 6639 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 6640 if (unlikely(err)) 6641 return err; 6642 6643 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6644 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 6645 6646 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6647 } 6648 6649 mpls_hdr(skb)->label_stack_entry = mpls_lse; 6650 6651 return 0; 6652} 6653EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 6654 6655/** 6656 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 6657 * 6658 * @skb: buffer 6659 * 6660 * Expects skb->data at mac header. 6661 * 6662 * Returns 0 on success, -errno otherwise. 
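 *
 * The TTL lives in the low byte of the label stack entry
 * (MPLS_LS_TTL_MASK / MPLS_LS_TTL_SHIFT); if the decrement reaches
 * zero, -EINVAL is returned so the caller can drop the packet. As an
 * illustration, an LSE of 0x00001140 (label 1, bottom of stack,
 * TTL 0x40) becomes 0x0000113f.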
6663 */ 6664int skb_mpls_dec_ttl(struct sk_buff *skb) 6665{ 6666 u32 lse; 6667 u8 ttl; 6668 6669 if (unlikely(!eth_p_mpls(skb->protocol))) 6670 return -EINVAL; 6671 6672 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) 6673 return -ENOMEM; 6674 6675 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 6676 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 6677 if (!--ttl) 6678 return -EINVAL; 6679 6680 lse &= ~MPLS_LS_TTL_MASK; 6681 lse |= ttl << MPLS_LS_TTL_SHIFT; 6682 6683 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 6684} 6685EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 6686 6687/** 6688 * alloc_skb_with_frags - allocate skb with page frags 6689 * 6690 * @header_len: size of linear part 6691 * @data_len: needed length in frags 6692 * @order: max page order desired. 6693 * @errcode: pointer to error code if any 6694 * @gfp_mask: allocation mask 6695 * 6696 * This can be used to allocate a paged skb, given a maximal order for frags. 6697 */ 6698struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 6699 unsigned long data_len, 6700 int order, 6701 int *errcode, 6702 gfp_t gfp_mask) 6703{ 6704 unsigned long chunk; 6705 struct sk_buff *skb; 6706 struct page *page; 6707 int nr_frags = 0; 6708 6709 *errcode = -EMSGSIZE; 6710 if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) 6711 return NULL; 6712 6713 *errcode = -ENOBUFS; 6714 skb = alloc_skb(header_len, gfp_mask); 6715 if (!skb) 6716 return NULL; 6717 6718 while (data_len) { 6719 if (nr_frags == MAX_SKB_FRAGS) 6720 goto failure; 6721 while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) 6722 order--; 6723 6724 if (order) { 6725 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 6726 __GFP_COMP | 6727 __GFP_NOWARN, 6728 order); 6729 if (!page) { 6730 order--; 6731 continue; 6732 } 6733 } else { 6734 page = alloc_page(gfp_mask); 6735 if (!page) 6736 goto failure; 6737 } 6738 chunk = min_t(unsigned long, data_len, 6739 PAGE_SIZE << order); 6740 skb_fill_page_desc(skb, nr_frags, page, 0, chunk); 6741 nr_frags++; 6742 skb->truesize += (PAGE_SIZE << order); 6743 data_len -= chunk; 6744 } 6745 return skb; 6746 6747failure: 6748 kfree_skb(skb); 6749 return NULL; 6750} 6751EXPORT_SYMBOL(alloc_skb_with_frags); 6752 6753/* carve out the first off bytes from skb when off < headlen */ 6754static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 6755 const int headlen, gfp_t gfp_mask) 6756{ 6757 int i; 6758 unsigned int size = skb_end_offset(skb); 6759 int new_hlen = headlen - off; 6760 u8 *data; 6761 6762 if (skb_pfmemalloc(skb)) 6763 gfp_mask |= __GFP_MEMALLOC; 6764 6765 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 6766 if (!data) 6767 return -ENOMEM; 6768 size = SKB_WITH_OVERHEAD(size); 6769 6770 /* Copy real data, and all frags */ 6771 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 6772 skb->len -= off; 6773 6774 memcpy((struct skb_shared_info *)(data + size), 6775 skb_shinfo(skb), 6776 offsetof(struct skb_shared_info, 6777 frags[skb_shinfo(skb)->nr_frags])); 6778 if (skb_cloned(skb)) { 6779 /* drop the old head gracefully */ 6780 if (skb_orphan_frags(skb, gfp_mask)) { 6781 skb_kfree_head(data, size); 6782 return -ENOMEM; 6783 } 6784 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 6785 skb_frag_ref(skb, i); 6786 if (skb_has_frag_list(skb)) 6787 skb_clone_fraglist(skb); 6788 skb_release_data(skb, SKB_CONSUMED); 6789 } else { 6790 /* we can reuse existing recount- all we did was 6791 * relocate values 6792 */ 6793 skb_free_head(skb); 6794 } 6795 6796 skb->head = 
data; 6797 skb->data = data; 6798 skb->head_frag = 0; 6799 skb_set_end_offset(skb, size); 6800 skb_set_tail_pointer(skb, skb_headlen(skb)); 6801 skb_headers_offset_update(skb, 0); 6802 skb->cloned = 0; 6803 skb->hdr_len = 0; 6804 skb->nohdr = 0; 6805 atomic_set(&skb_shinfo(skb)->dataref, 1); 6806 6807 return 0; 6808} 6809 6810static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); 6811 6812/* carve out the first eat bytes from skb's frag_list. May recurse into 6813 * pskb_carve() 6814 */ 6815static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, 6816 gfp_t gfp_mask) 6817{ 6818 struct sk_buff *list = shinfo->frag_list; 6819 struct sk_buff *clone = NULL; 6820 struct sk_buff *insp = NULL; 6821 6822 do { 6823 if (!list) { 6824 pr_err("Not enough bytes to eat. Want %d\n", eat); 6825 return -EFAULT; 6826 } 6827 if (list->len <= eat) { 6828 /* Eaten as whole. */ 6829 eat -= list->len; 6830 list = list->next; 6831 insp = list; 6832 } else { 6833 /* Eaten partially. */ 6834 if (skb_shared(list)) { 6835 clone = skb_clone(list, gfp_mask); 6836 if (!clone) 6837 return -ENOMEM; 6838 insp = list->next; 6839 list = clone; 6840 } else { 6841 /* This may be pulled without problems. */ 6842 insp = list; 6843 } 6844 if (pskb_carve(list, eat, gfp_mask) < 0) { 6845 kfree_skb(clone); 6846 return -ENOMEM; 6847 } 6848 break; 6849 } 6850 } while (eat); 6851 6852 /* Free pulled out fragments. */ 6853 while ((list = shinfo->frag_list) != insp) { 6854 shinfo->frag_list = list->next; 6855 consume_skb(list); 6856 } 6857 /* And insert new clone at head. */ 6858 if (clone) { 6859 clone->next = list; 6860 shinfo->frag_list = clone; 6861 } 6862 return 0; 6863} 6864 6865/* carve off first len bytes from skb. Split line (off) is in the 6866 * non-linear part of skb 6867 */ 6868static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, 6869 int pos, gfp_t gfp_mask) 6870{ 6871 int i, k = 0; 6872 unsigned int size = skb_end_offset(skb); 6873 u8 *data; 6874 const int nfrags = skb_shinfo(skb)->nr_frags; 6875 struct skb_shared_info *shinfo; 6876 6877 if (skb_pfmemalloc(skb)) 6878 gfp_mask |= __GFP_MEMALLOC; 6879 6880 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 6881 if (!data) 6882 return -ENOMEM; 6883 size = SKB_WITH_OVERHEAD(size); 6884 6885 memcpy((struct skb_shared_info *)(data + size), 6886 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); 6887 if (skb_orphan_frags(skb, gfp_mask)) { 6888 skb_kfree_head(data, size); 6889 return -ENOMEM; 6890 } 6891 shinfo = (struct skb_shared_info *)(data + size); 6892 for (i = 0; i < nfrags; i++) { 6893 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); 6894 6895 if (pos + fsize > off) { 6896 shinfo->frags[k] = skb_shinfo(skb)->frags[i]; 6897 6898 if (pos < off) { 6899 /* Split frag. 6900 * We have two variants in this case: 6901 * 1. Move all the frag to the second 6902 * part, if it is possible. F.e. 6903 * this approach is mandatory for TUX, 6904 * where splitting is expensive. 6905 * 2. Split is accurately. We make this. 6906 */ 6907 skb_frag_off_add(&shinfo->frags[0], off - pos); 6908 skb_frag_size_sub(&shinfo->frags[0], off - pos); 6909 } 6910 skb_frag_ref(skb, i); 6911 k++; 6912 } 6913 pos += fsize; 6914 } 6915 shinfo->nr_frags = k; 6916 if (skb_has_frag_list(skb)) 6917 skb_clone_fraglist(skb); 6918 6919 /* split line is in frag list */ 6920 if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { 6921 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
*/ 6922 if (skb_has_frag_list(skb)) 6923 kfree_skb_list(skb_shinfo(skb)->frag_list); 6924 skb_kfree_head(data, size); 6925 return -ENOMEM; 6926 } 6927 skb_release_data(skb, SKB_CONSUMED); 6928 6929 skb->head = data; 6930 skb->head_frag = 0; 6931 skb->data = data; 6932 skb_set_end_offset(skb, size); 6933 skb_reset_tail_pointer(skb); 6934 skb_headers_offset_update(skb, 0); 6935 skb->cloned = 0; 6936 skb->hdr_len = 0; 6937 skb->nohdr = 0; 6938 skb->len -= off; 6939 skb->data_len = skb->len; 6940 atomic_set(&skb_shinfo(skb)->dataref, 1); 6941 return 0; 6942} 6943 6944/* remove len bytes from the beginning of the skb */ 6945static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) 6946{ 6947 int headlen = skb_headlen(skb); 6948 6949 if (len < headlen) 6950 return pskb_carve_inside_header(skb, len, headlen, gfp); 6951 else 6952 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); 6953} 6954 6955/* Extract to_copy bytes starting at off from skb, and return this in 6956 * a new skb 6957 */ 6958struct sk_buff *pskb_extract(struct sk_buff *skb, int off, 6959 int to_copy, gfp_t gfp) 6960{ 6961 struct sk_buff *clone = skb_clone(skb, gfp); 6962 6963 if (!clone) 6964 return NULL; 6965 6966 if (pskb_carve(clone, off, gfp) < 0 || 6967 pskb_trim(clone, to_copy)) { 6968 kfree_skb(clone); 6969 return NULL; 6970 } 6971 return clone; 6972} 6973EXPORT_SYMBOL(pskb_extract); 6974 6975/** 6976 * skb_condense - try to get rid of fragments/frag_list if possible 6977 * @skb: buffer 6978 * 6979 * Can be used to save memory before skb is added to a busy queue. 6980 * If packet has bytes in frags and enough tail room in skb->head, 6981 * pull all of them, so that we can free the frags right now and adjust 6982 * truesize. 6983 * Notes: 6984 * We do not reallocate skb->head thus can not fail. 6985 * Caller must re-evaluate skb->truesize if needed. 6986 */ 6987void skb_condense(struct sk_buff *skb) 6988{ 6989 if (skb->data_len) { 6990 if (skb->data_len > skb->end - skb->tail || 6991 skb_cloned(skb) || !skb_frags_readable(skb)) 6992 return; 6993 6994 /* Nice, we can free page frag(s) right now */ 6995 __pskb_pull_tail(skb, skb->data_len); 6996 } 6997 /* At this point, skb->truesize might be over estimated, 6998 * because skb had a fragment, and fragments do not tell 6999 * their truesize. 7000 * When we pulled its content into skb->head, fragment 7001 * was freed, but __pskb_pull_tail() could not possibly 7002 * adjust skb->truesize, not knowing the frag truesize. 7003 */ 7004 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 7005} 7006EXPORT_SYMBOL(skb_condense); 7007 7008#ifdef CONFIG_SKB_EXTENSIONS 7009static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) 7010{ 7011 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); 7012} 7013 7014/** 7015 * __skb_ext_alloc - allocate a new skb extensions storage 7016 * 7017 * @flags: See kmalloc(). 7018 * 7019 * Returns the newly allocated pointer. The pointer can later attached to a 7020 * skb via __skb_ext_set(). 7021 * Note: caller must handle the skb_ext as an opaque data. 
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);

	if (new) {
		memset(new->offset, 0, sizeof(new->offset));
		refcount_set(&new->refcnt, 1);
	}

	return new;
}

static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
					 unsigned int old_active)
{
	struct skb_ext *new;

	if (refcount_read(&old->refcnt) == 1)
		return old;

	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
	if (!new)
		return NULL;

	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
	refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
		unsigned int i;

		for (i = 0; i < sp->len; i++)
			xfrm_state_hold(sp->xvec[i]);
	}
#endif
#ifdef CONFIG_MCTP_FLOWS
	if (old_active & (1 << SKB_EXT_MCTP)) {
		struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);

		if (flow->key)
			refcount_inc(&flow->key->refs);
	}
#endif
	__skb_ext_put(old);
	return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
		    struct skb_ext *ext)
{
	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);

	skb_ext_put(skb);
	newlen = newoff + skb_ext_type_len[id];
	ext->chunks = newlen;
	ext->offset[id] = newoff;
	skb->extensions = ext;
	skb->active_extensions = 1 << id;
	return skb_ext_get_ptr(ext, id);
}
EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of cloned buffers.
 *
 * Returns pointer to the extension or NULL on allocation failure.
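 *
 * Typical use (illustrative, with SKB_EXT_SEC_PATH as the example id):
 *
 *	struct sec_path *sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
 *
 *	if (!sp)
 *		return -ENOMEM;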
7109 */ 7110void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) 7111{ 7112 struct skb_ext *new, *old = NULL; 7113 unsigned int newlen, newoff; 7114 7115 if (skb->active_extensions) { 7116 old = skb->extensions; 7117 7118 new = skb_ext_maybe_cow(old, skb->active_extensions); 7119 if (!new) 7120 return NULL; 7121 7122 if (__skb_ext_exist(new, id)) 7123 goto set_active; 7124 7125 newoff = new->chunks; 7126 } else { 7127 newoff = SKB_EXT_CHUNKSIZEOF(*new); 7128 7129 new = __skb_ext_alloc(GFP_ATOMIC); 7130 if (!new) 7131 return NULL; 7132 } 7133 7134 newlen = newoff + skb_ext_type_len[id]; 7135 new->chunks = newlen; 7136 new->offset[id] = newoff; 7137set_active: 7138 skb->slow_gro = 1; 7139 skb->extensions = new; 7140 skb->active_extensions |= 1 << id; 7141 return skb_ext_get_ptr(new, id); 7142} 7143EXPORT_SYMBOL(skb_ext_add); 7144 7145#ifdef CONFIG_XFRM 7146static void skb_ext_put_sp(struct sec_path *sp) 7147{ 7148 unsigned int i; 7149 7150 for (i = 0; i < sp->len; i++) 7151 xfrm_state_put(sp->xvec[i]); 7152} 7153#endif 7154 7155#ifdef CONFIG_MCTP_FLOWS 7156static void skb_ext_put_mctp(struct mctp_flow *flow) 7157{ 7158 if (flow->key) 7159 mctp_key_unref(flow->key); 7160} 7161#endif 7162 7163void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) 7164{ 7165 struct skb_ext *ext = skb->extensions; 7166 7167 skb->active_extensions &= ~(1 << id); 7168 if (skb->active_extensions == 0) { 7169 skb->extensions = NULL; 7170 __skb_ext_put(ext); 7171#ifdef CONFIG_XFRM 7172 } else if (id == SKB_EXT_SEC_PATH && 7173 refcount_read(&ext->refcnt) == 1) { 7174 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); 7175 7176 skb_ext_put_sp(sp); 7177 sp->len = 0; 7178#endif 7179 } 7180} 7181EXPORT_SYMBOL(__skb_ext_del); 7182 7183void __skb_ext_put(struct skb_ext *ext) 7184{ 7185 /* If this is last clone, nothing can increment 7186 * it after check passes. Avoids one atomic op. 7187 */ 7188 if (refcount_read(&ext->refcnt) == 1) 7189 goto free_now; 7190 7191 if (!refcount_dec_and_test(&ext->refcnt)) 7192 return; 7193free_now: 7194#ifdef CONFIG_XFRM 7195 if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) 7196 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); 7197#endif 7198#ifdef CONFIG_MCTP_FLOWS 7199 if (__skb_ext_exist(ext, SKB_EXT_MCTP)) 7200 skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); 7201#endif 7202 7203 kmem_cache_free(skbuff_ext_cache, ext); 7204} 7205EXPORT_SYMBOL(__skb_ext_put); 7206#endif /* CONFIG_SKB_EXTENSIONS */ 7207 7208static void kfree_skb_napi_cache(struct sk_buff *skb) 7209{ 7210 /* if SKB is a clone, don't handle this case */ 7211 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { 7212 __kfree_skb(skb); 7213 return; 7214 } 7215 7216 local_bh_disable(); 7217 __napi_kfree_skb(skb, SKB_CONSUMED); 7218 local_bh_enable(); 7219} 7220 7221/** 7222 * skb_attempt_defer_free - queue skb for remote freeing 7223 * @skb: buffer 7224 * 7225 * Put @skb in a per-cpu list, using the cpu which 7226 * allocated the skb/pages to reduce false sharing 7227 * and memory zone spinlock contention. 
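 *
 * The skb is pushed onto a lockless per-cpu/per-node list keyed by
 * skb->alloc_cpu; once the list crosses half of sysctl_skb_defer_max
 * entries an IPI nudges the remote CPU to purge it. Anything that
 * cannot be deferred (same CPU, offline CPU, list already full) is
 * freed immediately instead.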
7228 */ 7229void skb_attempt_defer_free(struct sk_buff *skb) 7230{ 7231 struct skb_defer_node *sdn; 7232 unsigned long defer_count; 7233 int cpu = skb->alloc_cpu; 7234 unsigned int defer_max; 7235 bool kick; 7236 7237 if (cpu == raw_smp_processor_id() || 7238 WARN_ON_ONCE(cpu >= nr_cpu_ids) || 7239 !cpu_online(cpu)) { 7240nodefer: kfree_skb_napi_cache(skb); 7241 return; 7242 } 7243 7244 DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); 7245 DEBUG_NET_WARN_ON_ONCE(skb->destructor); 7246 DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); 7247 7248 sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); 7249 7250 defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); 7251 defer_count = atomic_long_inc_return(&sdn->defer_count); 7252 7253 if (defer_count >= defer_max) 7254 goto nodefer; 7255 7256 llist_add(&skb->ll_node, &sdn->defer_list); 7257 7258 /* Send an IPI every time queue reaches half capacity. */ 7259 kick = (defer_count - 1) == (defer_max >> 1); 7260 7261 /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU 7262 * if we are unlucky enough (this seems very unlikely). 7263 */ 7264 if (unlikely(kick)) 7265 kick_defer_list_purge(cpu); 7266} 7267 7268static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, 7269 size_t offset, size_t len) 7270{ 7271 const char *kaddr; 7272 __wsum csum; 7273 7274 kaddr = kmap_local_page(page); 7275 csum = csum_partial(kaddr + offset, len, 0); 7276 kunmap_local(kaddr); 7277 skb->csum = csum_block_add(skb->csum, csum, skb->len); 7278} 7279 7280/** 7281 * skb_splice_from_iter - Splice (or copy) pages to skbuff 7282 * @skb: The buffer to add pages to 7283 * @iter: Iterator representing the pages to be added 7284 * @maxsize: Maximum amount of pages to be added 7285 * 7286 * This is a common helper function for supporting MSG_SPLICE_PAGES. It 7287 * extracts pages from an iterator and adds them to the socket buffer if 7288 * possible, copying them to fragments if not possible (such as if they're slab 7289 * pages). 7290 * 7291 * Returns the amount of data spliced/copied or -EMSGSIZE if there's 7292 * insufficient space in the buffer to transfer anything. 
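 *
 * Pages are extracted in batches of up to eight, appended as frags up
 * to the sysctl_max_skb_frags limit, and, for CHECKSUM_NONE skbs, the
 * spliced bytes are folded into skb->csum as they are added. If some
 * data was transferred before an error occurred, the partial count is
 * returned rather than the error.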
7293 */ 7294ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, 7295 ssize_t maxsize) 7296{ 7297 size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); 7298 struct page *pages[8], **ppages = pages; 7299 ssize_t spliced = 0, ret = 0; 7300 unsigned int i; 7301 7302 while (iter->count > 0) { 7303 ssize_t space, nr, len; 7304 size_t off; 7305 7306 ret = -EMSGSIZE; 7307 space = frag_limit - skb_shinfo(skb)->nr_frags; 7308 if (space < 0) 7309 break; 7310 7311 /* We might be able to coalesce without increasing nr_frags */ 7312 nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); 7313 7314 len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); 7315 if (len <= 0) { 7316 ret = len ?: -EIO; 7317 break; 7318 } 7319 7320 i = 0; 7321 do { 7322 struct page *page = pages[i++]; 7323 size_t part = min_t(size_t, PAGE_SIZE - off, len); 7324 7325 ret = -EIO; 7326 if (WARN_ON_ONCE(!sendpage_ok(page))) 7327 goto out; 7328 7329 ret = skb_append_pagefrags(skb, page, off, part, 7330 frag_limit); 7331 if (ret < 0) { 7332 iov_iter_revert(iter, len); 7333 goto out; 7334 } 7335 7336 if (skb->ip_summed == CHECKSUM_NONE) 7337 skb_splice_csum_page(skb, page, off, part); 7338 7339 off = 0; 7340 spliced += part; 7341 maxsize -= part; 7342 len -= part; 7343 } while (len > 0); 7344 7345 if (maxsize <= 0) 7346 break; 7347 } 7348 7349out: 7350 skb_len_add(skb, spliced); 7351 return spliced ?: ret; 7352} 7353EXPORT_SYMBOL(skb_splice_from_iter); 7354 7355static __always_inline 7356size_t memcpy_from_iter_csum(void *iter_from, size_t progress, 7357 size_t len, void *to, void *priv2) 7358{ 7359 __wsum *csum = priv2; 7360 __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); 7361 7362 *csum = csum_block_add(*csum, next, progress); 7363 return 0; 7364} 7365 7366static __always_inline 7367size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, 7368 size_t len, void *to, void *priv2) 7369{ 7370 __wsum next, *csum = priv2; 7371 7372 next = csum_and_copy_from_user(iter_from, to + progress, len); 7373 *csum = csum_block_add(*csum, next, progress); 7374 return next ? 0 : len; 7375} 7376 7377bool csum_and_copy_from_iter_full(void *addr, size_t bytes, 7378 __wsum *csum, struct iov_iter *i) 7379{ 7380 size_t copied; 7381 7382 if (WARN_ON_ONCE(!i->data_source)) 7383 return false; 7384 copied = iterate_and_advance2(i, bytes, addr, csum, 7385 copy_from_user_iter_csum, 7386 memcpy_from_iter_csum); 7387 if (likely(copied == bytes)) 7388 return true; 7389 iov_iter_revert(i, copied); 7390 return false; 7391} 7392EXPORT_SYMBOL(csum_and_copy_from_iter_full); 7393 7394void get_netmem(netmem_ref netmem) 7395{ 7396 struct net_iov *niov; 7397 7398 if (netmem_is_net_iov(netmem)) { 7399 niov = netmem_to_net_iov(netmem); 7400 if (net_is_devmem_iov(niov)) 7401 net_devmem_get_net_iov(netmem_to_net_iov(netmem)); 7402 return; 7403 } 7404 get_page(netmem_to_page(netmem)); 7405} 7406EXPORT_SYMBOL(get_netmem); 7407 7408void put_netmem(netmem_ref netmem) 7409{ 7410 struct net_iov *niov; 7411 7412 if (netmem_is_net_iov(netmem)) { 7413 niov = netmem_to_net_iov(netmem); 7414 if (net_is_devmem_iov(niov)) 7415 net_devmem_put_net_iov(netmem_to_net_iov(netmem)); 7416 return; 7417 } 7418 7419 put_page(netmem_to_page(netmem)); 7420} 7421EXPORT_SYMBOL(put_netmem);
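
/* get_netmem() and put_netmem() are meant to be paired like get_page()
 * and put_page(): for ordinary page-backed netmem they take and drop a
 * page reference, for devmem net_iovs they go through
 * net_devmem_{get,put}_net_iov(), and for other net_iov providers they
 * are currently a no-op.
 */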