Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'udp_tunnel-gro-optimizations'

Paolo Abeni says:

====================
udp_tunnel: GRO optimizations

The UDP tunnel GRO stage is a source of measurable overhead for workloads
based on UDP-encapsulated traffic: each incoming packet requires a full
UDP socket lookup and an indirect call.

In the most common setups a single UDP tunnel device is used. In such
case we can optimize both the lookup and the indirect call.

Patch 1 tracks per netns the active UDP tunnels and replaces the socket
lookup with a single destination port comparison when possible.

Patch 2 tracks the different types of UDP tunnels and replaces the
indirect call with a static one when there is a single UDP tunnel type
active.

I measure ~5% performance improvement in TCP over UDP tunnel stream
tests on top of this series.

v3: https://lore.kernel.org/netdev/cover.1741632298.git.pabeni@redhat.com/
v2: https://lore.kernel.org/netdev/cover.1741338765.git.pabeni@redhat.com/
v1: https://lore.kernel.org/netdev/cover.1741275846.git.pabeni@redhat.com/
====================

Link: https://patch.msgid.link/cover.1741718157.git.pabeni@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+249 -2
+16
include/linux/udp.h
··· 101 101 102 102 /* Cache friendly copy of sk->sk_peek_off >= 0 */ 103 103 bool peeking_with_offset; 104 + 105 + /* 106 + * Accounting for the tunnel GRO fastpath. 107 + * Unprotected by compilers guard, as it uses space available in 108 + * the last UDP socket cacheline. 109 + */ 110 + struct hlist_node tunnel_list; 104 111 }; 105 112 106 113 #define udp_test_bit(nr, sk) \ ··· 225 218 #endif 226 219 227 220 #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) 221 + 222 + static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) 223 + { 224 + #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) 225 + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); 226 + #else 227 + return NULL; 228 + #endif 229 + } 228 230 229 231 #endif /* _LINUX_UDP_H */
+11
include/net/netns/ipv4.h
··· 47 47 }; 48 48 #endif 49 49 50 + struct udp_tunnel_gro { 51 + struct sock __rcu *sk; 52 + struct hlist_head list; 53 + }; 54 + 50 55 struct netns_ipv4 { 51 56 /* Cacheline organization can be found documented in 52 57 * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. ··· 89 84 90 85 struct inet_timewait_death_row tcp_death_row; 91 86 struct udp_table *udp_table; 87 + 88 + #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) 89 + /* Not in a pernet subsys because need to be available at GRO stage */ 90 + struct udp_tunnel_gro udp_tunnel_gro[2]; 91 + #endif 92 92 93 93 #ifdef CONFIG_SYSCTL 94 94 struct ctl_table_header *forw_hdr; ··· 287 277 struct hlist_head *inet_addr_lst; 288 278 struct delayed_work addr_chk_work; 289 279 }; 280 + 290 281 #endif
+1
include/net/udp.h
··· 290 290 struct udp_sock *up = udp_sk(sk); 291 291 292 292 skb_queue_head_init(&up->reader_queue); 293 + INIT_HLIST_NODE(&up->tunnel_list); 293 294 up->forward_threshold = sk->sk_rcvbuf >> 2; 294 295 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 295 296 }
+22
include/net/udp_tunnel.h
··· 203 203 udp_encap_enable(); 204 204 } 205 205 206 + #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) 207 + void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); 208 + void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); 209 + #else 210 + static inline void udp_tunnel_update_gro_lookup(struct net *net, 211 + struct sock *sk, bool add) {} 212 + static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} 213 + #endif 214 + 215 + static inline void udp_tunnel_cleanup_gro(struct sock *sk) 216 + { 217 + struct udp_sock *up = udp_sk(sk); 218 + struct net *net = sock_net(sk); 219 + 220 + udp_tunnel_update_gro_rcv(sk, false); 221 + 222 + if (!up->tunnel_list.pprev) 223 + return; 224 + 225 + udp_tunnel_update_gro_lookup(net, sk, false); 226 + } 227 + 206 228 #define UDP_TUNNEL_NIC_MAX_TABLES 4 207 229 208 230 enum udp_tunnel_nic_info_flags {
+12 -1
net/ipv4/udp.c
··· 2891 2891 if (encap_destroy) 2892 2892 encap_destroy(sk); 2893 2893 } 2894 - if (udp_test_bit(ENCAP_ENABLED, sk)) 2894 + if (udp_test_bit(ENCAP_ENABLED, sk)) { 2895 2895 static_branch_dec(&udp_encap_needed_key); 2896 + udp_tunnel_cleanup_gro(sk); 2897 + } 2896 2898 } 2897 2899 } 2898 2900 ··· 3806 3804 3807 3805 static int __net_init udp_pernet_init(struct net *net) 3808 3806 { 3807 + #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) 3808 + int i; 3809 + 3810 + /* No tunnel is configured */ 3811 + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { 3812 + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); 3813 + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); 3814 + } 3815 + #endif 3809 3816 udp_sysctl_init(net); 3810 3817 udp_set_table(net); 3811 3818
+166 -1
net/ipv4/udp_offload.c
··· 12 12 #include <net/udp.h> 13 13 #include <net/protocol.h> 14 14 #include <net/inet_common.h> 15 + #include <net/udp_tunnel.h> 16 + 17 + #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) 18 + 19 + /* 20 + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL 21 + * values for the udp tunnel static call. 22 + */ 23 + static struct sk_buff *dummy_gro_rcv(struct sock *sk, 24 + struct list_head *head, 25 + struct sk_buff *skb) 26 + { 27 + NAPI_GRO_CB(skb)->flush = 1; 28 + return NULL; 29 + } 30 + 31 + typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, 32 + struct list_head *head, 33 + struct sk_buff *skb); 34 + 35 + struct udp_tunnel_type_entry { 36 + udp_tunnel_gro_rcv_t gro_receive; 37 + refcount_t count; 38 + }; 39 + 40 + #define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ 41 + IS_ENABLED(CONFIG_VXLAN) * 2 + \ 42 + IS_ENABLED(CONFIG_NET_FOU) * 2) 43 + 44 + DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); 45 + static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); 46 + static struct mutex udp_tunnel_gro_type_lock; 47 + static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; 48 + static unsigned int udp_tunnel_gro_type_nr; 49 + static DEFINE_SPINLOCK(udp_tunnel_gro_lock); 50 + 51 + void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) 52 + { 53 + bool is_ipv6 = sk->sk_family == AF_INET6; 54 + struct udp_sock *tup, *up = udp_sk(sk); 55 + struct udp_tunnel_gro *udp_tunnel_gro; 56 + 57 + spin_lock(&udp_tunnel_gro_lock); 58 + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; 59 + if (add) 60 + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); 61 + else 62 + hlist_del_init(&up->tunnel_list); 63 + 64 + if (udp_tunnel_gro->list.first && 65 + !udp_tunnel_gro->list.first->next) { 66 + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, 67 + tunnel_list); 68 + 69 + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); 70 + } else { 71 + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); 72 + } 73 + 74 + spin_unlock(&udp_tunnel_gro_lock); 75 + } 76 + EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); 77 + 78 + void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) 79 + { 80 + struct udp_tunnel_type_entry *cur = NULL; 81 + struct udp_sock *up = udp_sk(sk); 82 + int i, old_gro_type_nr; 83 + 84 + if (!up->gro_receive) 85 + return; 86 + 87 + mutex_lock(&udp_tunnel_gro_type_lock); 88 + for (i = 0; i < udp_tunnel_gro_type_nr; i++) 89 + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) 90 + cur = &udp_tunnel_gro_types[i]; 91 + 92 + old_gro_type_nr = udp_tunnel_gro_type_nr; 93 + if (add) { 94 + /* 95 + * Update the matching entry, if found, or add a new one 96 + * if needed 97 + */ 98 + if (cur) { 99 + refcount_inc(&cur->count); 100 + goto out; 101 + } 102 + 103 + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { 104 + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); 105 + /* Ensure static call will never be enabled */ 106 + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 2; 107 + goto out; 108 + } 109 + 110 + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; 111 + refcount_set(&cur->count, 1); 112 + cur->gro_receive = up->gro_receive; 113 + } else { 114 + /* 115 + * The stack cleanups only successfully added tunnel, the 116 + * lookup on removal should never fail. 117 + */ 118 + if (WARN_ON_ONCE(!cur)) 119 + goto out; 120 + 121 + if (!refcount_dec_and_test(&cur->count)) 122 + goto out; 123 + 124 + /* avoid gaps, so that the enable tunnel has always id 0 */ 125 + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; 126 + } 127 + 128 + if (udp_tunnel_gro_type_nr == 1) { 129 + static_call_update(udp_tunnel_gro_rcv, 130 + udp_tunnel_gro_types[0].gro_receive); 131 + static_branch_enable(&udp_tunnel_static_call); 132 + } else if (old_gro_type_nr == 1) { 133 + static_branch_disable(&udp_tunnel_static_call); 134 + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); 135 + } 136 + 137 + out: 138 + mutex_unlock(&udp_tunnel_gro_type_lock); 139 + } 140 + EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); 141 + 142 + static void udp_tunnel_gro_init(void) 143 + { 144 + mutex_init(&udp_tunnel_gro_type_lock); 145 + } 146 + 147 + static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, 148 + struct list_head *head, 149 + struct sk_buff *skb) 150 + { 151 + if (static_branch_likely(&udp_tunnel_static_call)) { 152 + if (unlikely(gro_recursion_inc_test(skb))) { 153 + NAPI_GRO_CB(skb)->flush |= 1; 154 + return NULL; 155 + } 156 + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); 157 + } 158 + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); 159 + } 160 + 161 + #else 162 + 163 + static void udp_tunnel_gro_init(void) {} 164 + 165 + static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, 166 + struct list_head *head, 167 + struct sk_buff *skb) 168 + { 169 + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); 170 + } 171 + 172 + #endif 15 173 16 174 static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, 17 175 netdev_features_t features, ··· 780 622 781 623 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ 782 624 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); 783 - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); 625 + pp = udp_tunnel_gro_rcv(sk, head, skb); 784 626 785 627 out: 786 628 skb_gro_flush_final(skb, pp, flush); ··· 793 635 { 794 636 const struct iphdr *iph = skb_gro_network_header(skb); 795 637 struct net *net = dev_net_rcu(skb->dev); 638 + struct sock *sk; 796 639 int iif, sdif; 640 + 641 + sk = udp_tunnel_sk(net, false); 642 + if (sk && dport == htons(sk->sk_num)) 643 + return sk; 797 644 798 645 inet_get_iif_sdif(skb, &iif, &sdif); 799 646 ··· 930 767 .gro_complete = udp4_gro_complete, 931 768 }, 932 769 }; 770 + 771 + udp_tunnel_gro_init(); 933 772 return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); 934 773 }
+14
net/ipv4/udp_tunnel_core.c
··· 58 58 } 59 59 EXPORT_SYMBOL(udp_sock_create4); 60 60 61 + static bool sk_saddr_any(struct sock *sk) 62 + { 63 + #if IS_ENABLED(CONFIG_IPV6) 64 + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); 65 + #else 66 + return !sk->sk_rcv_saddr; 67 + #endif 68 + } 69 + 61 70 void setup_udp_tunnel_sock(struct net *net, struct socket *sock, 62 71 struct udp_tunnel_sock_cfg *cfg) 63 72 { ··· 89 80 udp_sk(sk)->gro_complete = cfg->gro_complete; 90 81 91 82 udp_tunnel_encap_enable(sk); 83 + 84 + udp_tunnel_update_gro_rcv(sock->sk, true); 85 + 86 + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) 87 + udp_tunnel_update_gro_lookup(net, sock->sk, true); 92 88 } 93 89 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); 94 90
+2
net/ipv6/udp.c
··· 46 46 #include <net/tcp_states.h> 47 47 #include <net/ip6_checksum.h> 48 48 #include <net/ip6_tunnel.h> 49 + #include <net/udp_tunnel.h> 49 50 #include <net/xfrm.h> 50 51 #include <net/inet_hashtables.h> 51 52 #include <net/inet6_hashtables.h> ··· 1826 1825 if (udp_test_bit(ENCAP_ENABLED, sk)) { 1827 1826 static_branch_dec(&udpv6_encap_needed_key); 1828 1827 udp_encap_disable(); 1828 + udp_tunnel_cleanup_gro(sk); 1829 1829 } 1830 1830 } 1831 1831 }
+5
net/ipv6/udp_offload.c
··· 118 118 { 119 119 const struct ipv6hdr *iph = skb_gro_network_header(skb); 120 120 struct net *net = dev_net_rcu(skb->dev); 121 + struct sock *sk; 121 122 int iif, sdif; 123 + 124 + sk = udp_tunnel_sk(net, true); 125 + if (sk && dport == htons(sk->sk_num)) 126 + return sk; 122 127 123 128 inet6_get_iif_sdif(skb, &iif, &sdif); 124 129