
net: more accurate skb truesize

skb truesize currently accounts for the sk_buff struct and only part of
the skb head. kmalloc() rounding is also ignored.

Considering that skb_shared_info is larger than sk_buff, it's time to
take it into account for better memory accounting.

This patch introduces the SKB_TRUESIZE(X) macro to centralize these
assumptions in a single place.
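
As a rough illustration, the macro just adds the cache-line-aligned
metadata overhead to the data size. A minimal userspace sketch, assuming
a 64-byte cache line and placeholder struct sizes (the real values of
sizeof(struct sk_buff) and sizeof(struct skb_shared_info) depend on
kernel version, architecture and config):

#include <stdio.h>

/* Assumed values, for illustration only */
#define SMP_CACHE_BYTES		64	/* cache line size */
#define SK_BUFF_SIZE		240	/* stand-in for sizeof(struct sk_buff) */
#define SHARED_INFO_SIZE	320	/* stand-in for sizeof(struct skb_shared_info) */

/* Round X up to a cache line multiple, as the kernel's SKB_DATA_ALIGN() does */
#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES - 1)) & ~(SMP_CACHE_BYTES - 1))

/* Mirror of the new SKB_TRUESIZE(X) macro, using the stand-in sizes */
#define SKB_TRUESIZE(X) ((X) + \
			 SKB_DATA_ALIGN(SK_BUFF_SIZE) + \
			 SKB_DATA_ALIGN(SHARED_INFO_SIZE))

int main(void)
{
	/* Old accounting charged only the data and struct sk_buff */
	printf("old truesize(256) = %d\n", 256 + SK_BUFF_SIZE);	/* 496 */
	/* New accounting also charges the aligned skb_shared_info */
	printf("new truesize(256) = %d\n", SKB_TRUESIZE(256));	/* 832 */
	return 0;
}

Under these assumed sizes, the truesize of a 256-byte skb grows from 496
to 832 bytes, which is why defaults derived from it (such as
_SK_MEM_OVERHEAD in net/core/sock.c below) grow as well.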

At skb allocation time, we place the skb_shared_info struct at the exact
end of the skb head, to make better use of memory (lowering the number
of reallocations), since kmalloc() gives us power-of-two memory blocks.
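
The placement trick itself is not skb-specific. A minimal sketch of the
idea in userspace C, where glibc's malloc_usable_size() stands in for
the kernel's ksize() and a small footer struct stands in for
skb_shared_info (all names here are illustrative, not the kernel API):

#include <malloc.h>	/* malloc_usable_size(), glibc-specific */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for struct skb_shared_info */
struct footer {
	int nr_frags;
};

int main(void)
{
	size_t size = 1000;	/* requested data (head) room */

	/* Request room for the data plus the footer, as __alloc_skb() does */
	char *data = malloc(size + sizeof(struct footer));
	if (!data)
		return 1;

	/* The allocator may round the request up (kmalloc() hands out
	 * power-of-two blocks). Recover the real usable size, the way the
	 * patch uses ksize(), and put the footer at the very end of the
	 * block, so the rounding slack becomes usable data room.
	 */
	size = malloc_usable_size(data) - sizeof(struct footer);
	struct footer *ftr = (struct footer *)(data + size);
	memset(ftr, 0, sizeof(*ftr));

	printf("requested 1000 bytes, got %zu bytes of data room\n", size);

	free(data);
	return 0;
}

In the kernel the same arithmetic goes through
SKB_WITH_OVERHEAD(ksize(data)); since kmalloc() blocks of this size are
power-of-two sized and the skb_shared_info size is rounded with
SKB_DATA_ALIGN(), the footer also ends up cache line aligned.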

Unless SLUB/SLAB debug is active, both skb->head and skb_shared_info are
aligned to cache lines, as before.

Note: This patch might trigger performance regressions in misconfigured
protocol stacks, by hitting per-socket or global memory limits that were
previously not reached. But it's a necessary step toward more accurate
memory accounting.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Andi Kleen <ak@linux.intel.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Eric Dumazet, committed by David S. Miller
87fb4b7b 97ba0eb6

Total: +32 -19

+5 -0
include/linux/skbuff.h
@@ -46,6 +46,11 @@
 #define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
 #define SKB_MAX_ALLOC		(SKB_MAX_ORDER(0, 2))
 
+/* return minimum truesize of one skb containing X bytes of data */
+#define SKB_TRUESIZE(X) ((X) +						\
+			 SKB_DATA_ALIGN(sizeof(struct sk_buff)) +	\
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
 /* A. Checksumming of received packets by device.
  *
  * NONE: device failed to checksum this packet.
+14 -4
net/core/skbuff.c
@@ -184,11 +184,20 @@
 		goto out;
 	prefetchw(skb);
 
-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	/* We do our best to align skb_shared_info on a separate cache
+	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+	 * Both skb->head and skb_shared_info are cache line aligned.
+	 */
+	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	data = kmalloc_node_track_caller(size, gfp_mask, node);
 	if (!data)
 		goto nodata;
+	/* kmalloc(size) might give us more room than requested.
+	 * Put skb_shared_info exactly at the end of allocated zone,
+	 * to allow max possible filling before reallocation.
+	 */
+	size = SKB_WITH_OVERHEAD(ksize(data));
 	prefetchw(data + size);
 
 	/*
@@ -197,7 +206,8 @@
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
-	skb->truesize = size + sizeof(struct sk_buff);
+	/* Account for allocated memory : skb + skb->head */
+	skb->truesize = SKB_TRUESIZE(size);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
+1 -1
net/core/sock.c
@@ -207,7 +207,7 @@
  * not depend upon such differences.
  */
 #define _SK_MEM_PACKETS		256
-#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 
+2 -3
net/ipv4/icmp.c
@@ -1152,10 +1152,9 @@
 		net->ipv4.icmp_sk[i] = sk;
 
 		/* Enough space for 2 64K ICMP packets, including
-		 * sk_buff struct overhead.
+		 * sk_buff/skb_shared_info struct overhead.
 		 */
-		sk->sk_sndbuf =
-			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
 
 		/*
 		 * Speedup sock_wfree()
+7 -7
net/ipv4/tcp_input.c
@@ -265,8 +265,7 @@
 
 static void tcp_fixup_sndbuf(struct sock *sk)
 {
-	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
-		     sizeof(struct sk_buff);
+	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
 
 	if (sk->sk_sndbuf < 3 * sndmem) {
 		sk->sk_sndbuf = 3 * sndmem;
@@ -348,7 +349,7 @@
 static void tcp_fixup_rcvbuf(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+	int rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 
 	/* Try to select rcvbuf so that 4 mss-sized segments
 	 * will fit to window and corresponding skbs will fit to our rcvbuf.
@@ -539,8 +540,7 @@
 		space /= tp->advmss;
 		if (!space)
 			space = 1;
-		rcvmem = (tp->advmss + MAX_TCP_HEADER +
-			  16 + sizeof(struct sk_buff));
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 		while (tcp_win_from_space(rcvmem) < tp->advmss)
 			rcvmem += 128;
 		space *= rcvmem;
@@ -4948,8 +4950,10 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
-			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+		int sndmem = SKB_TRUESIZE(max_t(u32,
+						tp->rx_opt.mss_clamp,
+						tp->mss_cache) +
+					  MAX_TCP_HEADER);
 		int demanded = max_t(unsigned int, tp->snd_cwnd,
 				     tp->reordering + 1);
 		sndmem *= 2 * demanded;
+1 -2
net/ipv6/icmp.c
@@ -835,6 +835,5 @@
 		/* Enough space for 2 64K ICMP packets, including
 		 * sk_buff struct overhead.
 		 */
-		sk->sk_sndbuf =
-			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
 	}
+1 -1
net/iucv/af_iucv.c
@@ -1819,7 +1819,7 @@
 		goto save_message;
 
 	len = atomic_read(&sk->sk_rmem_alloc);
-	len += iucv_msg_length(msg) + sizeof(struct sk_buff);
+	len += SKB_TRUESIZE(iucv_msg_length(msg));
 	if (len > sk->sk_rcvbuf)
 		goto save_message;
 
+1 -1
net/sctp/protocol.c
@@ -1299,7 +1299,7 @@
 	max_share = min(4UL*1024*1024, limit);
 
 	sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */
-	sysctl_sctp_rmem[1] = (1500 *(sizeof(struct sk_buff) + 1));
+	sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1);
 	sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);
 
 	sysctl_sctp_wmem[0] = SK_MEM_QUANTUM;