Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: gro: allow to build full sized skb

skb_gro_receive() is currently limited to 16 or 17 MSS per GRO skb,
typically 24616 bytes, because it fills up to MAX_SKB_FRAGS frags.

It's relatively easy to extend the skb using frag_list to allow
more frags to be appended into the last sk_buff.

This still builds very efficient skbs, and allows reaching 45 MSS per
skb.

(45 MSS GRO packet uses one skb plus a frag_list containing 2 additional
sk_buff)

High speed TCP flows benefit from this extension by lowering TCP stack
cpu usage (less packets stored in receive queue, less ACK packets
processed)

Forwarding setups could be hurt, as such skbs will need to be
linearized, although it's not a new problem, as GRO could already
provide skbs with a frag_list.

We could make the 65536 bytes threshold a tunable to mitigate this.

(First time we need to linearize skb in skb_needs_linearize(), we could
lower the tunable to ~16*1460 so that following skb_gro_receive() calls
build smaller skbs)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
8a29111c 4c60f1d6

+26 -17
+26 -17
net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ (around line 2937) @@
 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
-	struct sk_buff *p = *head;
-	struct sk_buff *nskb;
-	struct skb_shared_info *skbinfo = skb_shinfo(skb);
-	struct skb_shared_info *pinfo = skb_shinfo(p);
-	unsigned int headroom;
-	unsigned int len = skb_gro_len(skb);
+	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
 	unsigned int offset = skb_gro_offset(skb);
 	unsigned int headlen = skb_headlen(skb);
+	struct sk_buff *nskb, *lp, *p = *head;
+	unsigned int len = skb_gro_len(skb);
 	unsigned int delta_truesize;
+	unsigned int headroom;
 
-	if (p->len + len >= 65536)
+	if (unlikely(p->len + len >= 65536))
 		return -E2BIG;
 
-	if (pinfo->frag_list)
-		goto merge;
-	else if (headlen <= offset) {
+	lp = NAPI_GRO_CB(p)->last ?: p;
+	pinfo = skb_shinfo(lp);
+
+	if (headlen <= offset) {
 		skb_frag_t *frag;
 		skb_frag_t *frag2;
 		int i = skbinfo->nr_frags;
 		int nr_frags = pinfo->nr_frags + i;
 
-		offset -= headlen;
-
 		if (nr_frags > MAX_SKB_FRAGS)
-			return -E2BIG;
+			goto merge;
 
+		offset -= headlen;
 		pinfo->nr_frags = nr_frags;
 		skbinfo->nr_frags = 0;
···
@@ (around line 2992) @@
 		unsigned int first_offset;
 
 		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
-			return -E2BIG;
+			goto merge;
 
 		first_offset = skb->data -
 			       (unsigned char *)page_address(page) +
···
@@ (around line 3010) @@
 		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
 		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
 		goto done;
-	} else if (skb_gro_len(p) != pinfo->gso_size)
+	}
+	if (pinfo->frag_list)
+		goto merge;
+	if (skb_gro_len(p) != pinfo->gso_size)
 		return -E2BIG;
 
 	headroom = skb_headroom(p);
···
@@ (around line 3063) @@
 
 	__skb_pull(skb, offset);
 
-	NAPI_GRO_CB(p)->last->next = skb;
+	if (!NAPI_GRO_CB(p)->last)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
 	NAPI_GRO_CB(p)->last = skb;
 	skb_header_release(skb);
+	lp = p;
 
 done:
 	NAPI_GRO_CB(p)->count++;
 	p->data_len += len;
 	p->truesize += delta_truesize;
 	p->len += len;
-
+	if (lp != p) {
+		lp->data_len += len;
+		lp->truesize += delta_truesize;
+		lp->len += len;
+	}
 	NAPI_GRO_CB(skb)->same_flow = 1;
 	return 0;
 }