Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: reorganize sk_buff for faster __copy_skb_header()

With proliferation of bit fields in sk_buff, __copy_skb_header() became
quite expensive, showing as the most expensive function in a GSO
workload.

__copy_skb_header() performance is also critical for non GSO TCP
operations, as it is used from skb_clone()

This patch carefully moves all the fields that are not copied into a
separate zone: cloned, nohdr, fclone, peeked, head_frag, xmit_more

Then I moved all the remaining copied fields into a section
delimited by headers_start[0]/headers_end[0] so that we
can use a single memcpy() call, inlined by the compiler using
long-word loads/stores.

I also tried to make all copies follow the natural field order of
sk_buff, to help hardware prefetching.

I made sure sk_buff size did not change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
b1937227 842abe08

+118 -105
+75 -64
include/linux/skbuff.h
··· 527 527 char cb[48] __aligned(8); 528 528 529 529 unsigned long _skb_refdst; 530 + void (*destructor)(struct sk_buff *skb); 530 531 #ifdef CONFIG_XFRM 531 532 struct sec_path *sp; 533 + #endif 534 + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 535 + struct nf_conntrack *nfct; 536 + #endif 537 + #ifdef CONFIG_BRIDGE_NETFILTER 538 + struct nf_bridge_info *nf_bridge; 532 539 #endif 533 540 unsigned int len, 534 541 data_len; 535 542 __u16 mac_len, 536 543 hdr_len; 537 - union { 538 - __wsum csum; 539 - struct { 540 - __u16 csum_start; 541 - __u16 csum_offset; 542 - }; 543 - }; 544 - __u32 priority; 544 + 545 + /* Following fields are _not_ copied in __copy_skb_header() 546 + * Note that queue_mapping is here mostly to fill a hole. 547 + */ 545 548 kmemcheck_bitfield_begin(flags1); 546 - __u8 ignore_df:1, 547 - cloned:1, 548 - ip_summed:2, 549 + __u16 queue_mapping; 550 + __u8 cloned:1, 549 551 nohdr:1, 550 - nfctinfo:3; 552 + fclone:2, 553 + peeked:1, 554 + head_frag:1, 555 + xmit_more:1; 556 + /* one bit hole */ 557 + kmemcheck_bitfield_end(flags1); 558 + 559 + 560 + 561 + /* fields enclosed in headers_start/headers_end are copied 562 + * using a single memcpy() in __copy_skb_header() 563 + */ 564 + __u32 headers_start[0]; 551 565 552 566 /* if you move pkt_type around you also must adapt those constants */ 553 567 #ifdef __BIG_ENDIAN_BITFIELD ··· 572 558 #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) 573 559 574 560 __u8 __pkt_type_offset[0]; 575 - __u8 pkt_type:3, 576 - fclone:2, 577 - ipvs_property:1, 578 - peeked:1, 579 - nf_trace:1; 580 - kmemcheck_bitfield_end(flags1); 581 - __be16 protocol; 561 + __u8 pkt_type:3; 562 + __u8 pfmemalloc:1; 563 + __u8 ignore_df:1; 564 + __u8 nfctinfo:3; 582 565 583 - void (*destructor)(struct sk_buff *skb); 584 - #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 585 - struct nf_conntrack *nfct; 566 + __u8 nf_trace:1; 567 + __u8 ip_summed:2; 568 + __u8 
ooo_okay:1; 569 + __u8 l4_hash:1; 570 + __u8 sw_hash:1; 571 + __u8 wifi_acked_valid:1; 572 + __u8 wifi_acked:1; 573 + 574 + __u8 no_fcs:1; 575 + /* Indicates the inner headers are valid in the skbuff. */ 576 + __u8 encapsulation:1; 577 + __u8 encap_hdr_csum:1; 578 + __u8 csum_valid:1; 579 + __u8 csum_complete_sw:1; 580 + __u8 csum_level:2; 581 + __u8 csum_bad:1; 582 + 583 + #ifdef CONFIG_IPV6_NDISC_NODETYPE 584 + __u8 ndisc_nodetype:2; 586 585 #endif 587 - #ifdef CONFIG_BRIDGE_NETFILTER 588 - struct nf_bridge_info *nf_bridge; 589 - #endif 590 - 591 - int skb_iif; 592 - 593 - __u32 hash; 594 - 595 - __be16 vlan_proto; 596 - __u16 vlan_tci; 586 + __u8 ipvs_property:1; 587 + /* 5 or 7 bit hole */ 597 588 598 589 #ifdef CONFIG_NET_SCHED 599 590 __u16 tc_index; /* traffic control index */ ··· 607 588 #endif 608 589 #endif 609 590 610 - __u16 queue_mapping; 611 - kmemcheck_bitfield_begin(flags2); 612 - __u8 xmit_more:1; 613 - #ifdef CONFIG_IPV6_NDISC_NODETYPE 614 - __u8 ndisc_nodetype:2; 615 - #endif 616 - __u8 pfmemalloc:1; 617 - __u8 ooo_okay:1; 618 - __u8 l4_hash:1; 619 - __u8 sw_hash:1; 620 - __u8 wifi_acked_valid:1; 621 - __u8 wifi_acked:1; 622 - __u8 no_fcs:1; 623 - __u8 head_frag:1; 624 - /* Indicates the inner headers are valid in the skbuff. 
*/ 625 - __u8 encapsulation:1; 626 - __u8 encap_hdr_csum:1; 627 - __u8 csum_valid:1; 628 - __u8 csum_complete_sw:1; 629 - /* 1/3 bit hole (depending on ndisc_nodetype presence) */ 630 - kmemcheck_bitfield_end(flags2); 631 - 591 + union { 592 + __wsum csum; 593 + struct { 594 + __u16 csum_start; 595 + __u16 csum_offset; 596 + }; 597 + }; 598 + __u32 priority; 599 + int skb_iif; 600 + __u32 hash; 601 + __be16 vlan_proto; 602 + __u16 vlan_tci; 632 603 #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL 633 604 union { 634 605 unsigned int napi_id; ··· 634 625 __u32 reserved_tailroom; 635 626 }; 636 627 637 - kmemcheck_bitfield_begin(flags3); 638 - __u8 csum_level:2; 639 - __u8 csum_bad:1; 640 - /* 13 bit hole */ 641 - kmemcheck_bitfield_end(flags3); 642 - 643 628 __be16 inner_protocol; 644 629 __u16 inner_transport_header; 645 630 __u16 inner_network_header; 646 631 __u16 inner_mac_header; 632 + 633 + __be16 protocol; 647 634 __u16 transport_header; 648 635 __u16 network_header; 649 636 __u16 mac_header; 637 + 638 + __u32 headers_end[0]; 639 + 650 640 /* These elements must be at the end, see alloc_skb() for details. */ 651 641 sk_buff_data_t tail; 652 642 sk_buff_data_t end; ··· 3048 3040 } 3049 3041 3050 3042 /* Note: This doesn't put any conntrack and bridge info in dst. 
*/ 3051 - static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) 3043 + static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, 3044 + bool copy) 3052 3045 { 3053 3046 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 3054 3047 dst->nfct = src->nfct; 3055 3048 nf_conntrack_get(src->nfct); 3056 - dst->nfctinfo = src->nfctinfo; 3049 + if (copy) 3050 + dst->nfctinfo = src->nfctinfo; 3057 3051 #endif 3058 3052 #ifdef CONFIG_BRIDGE_NETFILTER 3059 3053 dst->nf_bridge = src->nf_bridge; 3060 3054 nf_bridge_get(src->nf_bridge); 3061 3055 #endif 3062 3056 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) 3063 - dst->nf_trace = src->nf_trace; 3057 + if (copy) 3058 + dst->nf_trace = src->nf_trace; 3064 3059 #endif 3065 3060 } 3066 3061 ··· 3075 3064 #ifdef CONFIG_BRIDGE_NETFILTER 3076 3065 nf_bridge_put(dst->nf_bridge); 3077 3066 #endif 3078 - __nf_copy(dst, src); 3067 + __nf_copy(dst, src, true); 3079 3068 } 3080 3069 3081 3070 #ifdef CONFIG_NETWORK_SECMARK
+43 -41
net/core/skbuff.c
··· 261 261 atomic_t *fclone_ref = (atomic_t *) (child + 1); 262 262 263 263 kmemcheck_annotate_bitfield(child, flags1); 264 - kmemcheck_annotate_bitfield(child, flags2); 265 264 skb->fclone = SKB_FCLONE_ORIG; 266 265 atomic_set(fclone_ref, 1); 267 266 ··· 674 675 } 675 676 EXPORT_SYMBOL(consume_skb); 676 677 678 + /* Make sure a field is enclosed inside headers_start/headers_end section */ 679 + #define CHECK_SKB_FIELD(field) \ 680 + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ 681 + offsetof(struct sk_buff, headers_start)); \ 682 + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ 683 + offsetof(struct sk_buff, headers_end)); \ 684 + 677 685 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 678 686 { 679 687 new->tstamp = old->tstamp; 688 + /* We do not copy old->sk */ 680 689 new->dev = old->dev; 681 - new->transport_header = old->transport_header; 682 - new->network_header = old->network_header; 683 - new->mac_header = old->mac_header; 684 - new->inner_protocol = old->inner_protocol; 685 - new->inner_transport_header = old->inner_transport_header; 686 - new->inner_network_header = old->inner_network_header; 687 - new->inner_mac_header = old->inner_mac_header; 690 + memcpy(new->cb, old->cb, sizeof(old->cb)); 688 691 skb_dst_copy(new, old); 689 - skb_copy_hash(new, old); 690 - new->ooo_okay = old->ooo_okay; 691 - new->no_fcs = old->no_fcs; 692 - new->encapsulation = old->encapsulation; 693 - new->encap_hdr_csum = old->encap_hdr_csum; 694 - new->csum_valid = old->csum_valid; 695 - new->csum_complete_sw = old->csum_complete_sw; 696 692 #ifdef CONFIG_XFRM 697 693 new->sp = secpath_get(old->sp); 698 694 #endif 699 - memcpy(new->cb, old->cb, sizeof(old->cb)); 700 - new->csum = old->csum; 701 - new->ignore_df = old->ignore_df; 702 - new->pkt_type = old->pkt_type; 703 - new->ip_summed = old->ip_summed; 704 - skb_copy_queue_mapping(new, old); 705 - new->priority = old->priority; 706 - #if IS_ENABLED(CONFIG_IP_VS) 707 - new->ipvs_property = 
old->ipvs_property; 708 - #endif 709 - new->pfmemalloc = old->pfmemalloc; 710 - new->protocol = old->protocol; 711 - new->mark = old->mark; 712 - new->skb_iif = old->skb_iif; 713 - __nf_copy(new, old); 714 - #ifdef CONFIG_NET_SCHED 715 - new->tc_index = old->tc_index; 716 - #ifdef CONFIG_NET_CLS_ACT 717 - new->tc_verd = old->tc_verd; 718 - #endif 719 - #endif 720 - new->vlan_proto = old->vlan_proto; 721 - new->vlan_tci = old->vlan_tci; 695 + __nf_copy(new, old, false); 722 696 723 - skb_copy_secmark(new, old); 697 + /* Note : this field could be in headers_start/headers_end section 698 + * It is not yet because we do not want to have a 16 bit hole 699 + */ 700 + new->queue_mapping = old->queue_mapping; 724 701 702 + memcpy(&new->headers_start, &old->headers_start, 703 + offsetof(struct sk_buff, headers_end) - 704 + offsetof(struct sk_buff, headers_start)); 705 + CHECK_SKB_FIELD(protocol); 706 + CHECK_SKB_FIELD(csum); 707 + CHECK_SKB_FIELD(hash); 708 + CHECK_SKB_FIELD(priority); 709 + CHECK_SKB_FIELD(skb_iif); 710 + CHECK_SKB_FIELD(vlan_proto); 711 + CHECK_SKB_FIELD(vlan_tci); 712 + CHECK_SKB_FIELD(transport_header); 713 + CHECK_SKB_FIELD(network_header); 714 + CHECK_SKB_FIELD(mac_header); 715 + CHECK_SKB_FIELD(inner_protocol); 716 + CHECK_SKB_FIELD(inner_transport_header); 717 + CHECK_SKB_FIELD(inner_network_header); 718 + CHECK_SKB_FIELD(inner_mac_header); 719 + CHECK_SKB_FIELD(mark); 720 + #ifdef CONFIG_NETWORK_SECMARK 721 + CHECK_SKB_FIELD(secmark); 722 + #endif 725 723 #ifdef CONFIG_NET_RX_BUSY_POLL 726 - new->napi_id = old->napi_id; 724 + CHECK_SKB_FIELD(napi_id); 727 725 #endif 726 + #ifdef CONFIG_NET_SCHED 727 + CHECK_SKB_FIELD(tc_index); 728 + #ifdef CONFIG_NET_CLS_ACT 729 + CHECK_SKB_FIELD(tc_verd); 730 + #endif 731 + #endif 732 + 728 733 } 729 734 730 735 /* ··· 879 876 return NULL; 880 877 881 878 kmemcheck_annotate_bitfield(n, flags1); 882 - kmemcheck_annotate_bitfield(n, flags2); 883 879 n->fclone = SKB_FCLONE_UNAVAILABLE; 884 880 } 885 881