Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Add sysctl to toggle early demux for tcp and udp

Certain systems process a significant unconnected UDP workload.
It would be preferable to disable UDP early demux for those systems
and enable it for TCP only.

By disabling UDP demux, we see these slight gains on an ARM64 system:
782 -> 788Mbps unconnected single stream UDPv4
633 -> 654Mbps unconnected UDPv4 different sources

The performance impact can change based on CPU architecture and cache
sizes. There will not be much difference seen if the entire UDP hash
table is in cache.

Both sysctls are enabled by default to preserve existing behavior.

v1->v2: Change function pointer instead of adding conditional as
suggested by Stephen.

v2->v3: Read once in callers to avoid issues due to compiler
optimizations. Also update commit message with the tests.

v3->v4: Store and use read once result instead of querying pointer
again incorrectly.

v4->v5: Refactor to avoid errors due to compilation with IPV6={m,n}

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Tom Herbert <tom@herbertland.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

subashab@codeaurora.org and committed by
David S. Miller
dddb64bc 8fa96e3b

+103 -14
+10 -1
Documentation/networking/ip-sysctl.txt
··· 856 856 ip_early_demux - BOOLEAN 857 857 Optimize input packet processing down to one demux for 858 858 certain kinds of local sockets. Currently we only do this 859 - for established TCP sockets. 859 + for established TCP and connected UDP sockets. 860 860 861 861 It may add an additional cost for pure routing workloads that 862 862 reduces overall throughput, in such case you should disable it. 863 + Default: 1 864 + 865 + tcp_early_demux - BOOLEAN 866 + Enable early demux for established TCP sockets. 867 + Default: 1 868 + 869 + udp_early_demux - BOOLEAN 870 + Enable early demux for connected UDP sockets. Disable this if 871 + your system could experience more unconnected load. 863 872 Default: 1 864 873 865 874 icmp_echo_ignore_all - BOOLEAN
+2
include/net/netns/ipv4.h
··· 95 95 /* Shall we try to damage output packets if routing dev changes? */ 96 96 int sysctl_ip_dynaddr; 97 97 int sysctl_ip_early_demux; 98 + int sysctl_tcp_early_demux; 99 + int sysctl_udp_early_demux; 98 100 99 101 int sysctl_fwmark_reflect; 100 102 int sysctl_tcp_fwmark_accept;
+4 -3
include/net/protocol.h
··· 40 40 /* This is used to register protocols. */ 41 41 struct net_protocol { 42 42 void (*early_demux)(struct sk_buff *skb); 43 + void (*early_demux_handler)(struct sk_buff *skb); 43 44 int (*handler)(struct sk_buff *skb); 44 45 void (*err_handler)(struct sk_buff *skb, u32 info); 45 46 unsigned int no_policy:1, ··· 55 54 #if IS_ENABLED(CONFIG_IPV6) 56 55 struct inet6_protocol { 57 56 void (*early_demux)(struct sk_buff *skb); 58 - 57 + void (*early_demux_handler)(struct sk_buff *skb); 59 58 int (*handler)(struct sk_buff *skb); 60 59 61 60 void (*err_handler)(struct sk_buff *skb, ··· 93 92 #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ 94 93 #define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ 95 94 96 - extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; 95 + extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; 97 96 extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; 98 97 extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; 99 98 100 99 #if IS_ENABLED(CONFIG_IPV6) 101 - extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; 100 + extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; 102 101 #endif 103 102 104 103 int inet_add_protocol(const struct net_protocol *prot, unsigned char num);
+1
include/net/udp.h
··· 372 372 #if IS_ENABLED(CONFIG_IPV6) 373 373 void udpv6_encap_enable(void); 374 374 #endif 375 + 375 376 #endif /* _UDP_H */
+6 -2
net/ipv4/af_inet.c
··· 1599 1599 }; 1600 1600 #endif 1601 1601 1602 - static const struct net_protocol tcp_protocol = { 1602 + static struct net_protocol tcp_protocol = { 1603 1603 .early_demux = tcp_v4_early_demux, 1604 + .early_demux_handler = tcp_v4_early_demux, 1604 1605 .handler = tcp_v4_rcv, 1605 1606 .err_handler = tcp_v4_err, 1606 1607 .no_policy = 1, ··· 1609 1608 .icmp_strict_tag_validation = 1, 1610 1609 }; 1611 1610 1612 - static const struct net_protocol udp_protocol = { 1611 + static struct net_protocol udp_protocol = { 1613 1612 .early_demux = udp_v4_early_demux, 1613 + .early_demux_handler = udp_v4_early_demux, 1614 1614 .handler = udp_rcv, 1615 1615 .err_handler = udp_err, 1616 1616 .no_policy = 1, ··· 1722 1720 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 1723 1721 net->ipv4.sysctl_ip_dynaddr = 0; 1724 1722 net->ipv4.sysctl_ip_early_demux = 1; 1723 + net->ipv4.sysctl_udp_early_demux = 1; 1724 + net->ipv4.sysctl_tcp_early_demux = 1; 1725 1725 #ifdef CONFIG_SYSCTL 1726 1726 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; 1727 1727 #endif
+3 -2
net/ipv4/ip_input.c
··· 313 313 const struct iphdr *iph = ip_hdr(skb); 314 314 struct rtable *rt; 315 315 struct net_device *dev = skb->dev; 316 + void (*edemux)(struct sk_buff *skb); 316 317 317 318 /* if ingress device is enslaved to an L3 master device pass the 318 319 * skb to its handler for processing ··· 330 329 int protocol = iph->protocol; 331 330 332 331 ipprot = rcu_dereference(inet_protos[protocol]); 333 - if (ipprot && ipprot->early_demux) { 334 - ipprot->early_demux(skb); 332 + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { 333 + edemux(skb); 335 334 /* must reload iph, skb->head might have changed */ 336 335 iph = ip_hdr(skb); 337 336 }
+1 -1
net/ipv4/protocol.c
··· 28 28 #include <linux/spinlock.h> 29 29 #include <net/protocol.h> 30 30 31 - const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 31 + struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 32 32 const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; 33 33 EXPORT_SYMBOL(inet_offloads); 34 34
+67
net/ipv4/sysctl_net_ipv4.c
··· 24 24 #include <net/cipso_ipv4.h> 25 25 #include <net/inet_frag.h> 26 26 #include <net/ping.h> 27 + #include <net/protocol.h> 27 28 28 29 static int zero; 29 30 static int one = 1; ··· 292 291 user_key[0], user_key[1], user_key[2], user_key[3], 293 292 (char *)tbl.data, ret); 294 293 kfree(tbl.data); 294 + return ret; 295 + } 296 + 297 + static void proc_configure_early_demux(int enabled, int protocol) 298 + { 299 + struct net_protocol *ipprot; 300 + #if IS_ENABLED(CONFIG_IPV6) 301 + struct inet6_protocol *ip6prot; 302 + #endif 303 + 304 + ipprot = rcu_dereference(inet_protos[protocol]); 305 + if (ipprot) 306 + ipprot->early_demux = enabled ? ipprot->early_demux_handler : 307 + NULL; 308 + 309 + #if IS_ENABLED(CONFIG_IPV6) 310 + ip6prot = rcu_dereference(inet6_protos[protocol]); 311 + if (ip6prot) 312 + ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : 313 + NULL; 314 + #endif 315 + } 316 + 317 + static int proc_tcp_early_demux(struct ctl_table *table, int write, 318 + void __user *buffer, size_t *lenp, loff_t *ppos) 319 + { 320 + int ret = 0; 321 + 322 + ret = proc_dointvec(table, write, buffer, lenp, ppos); 323 + 324 + if (write && !ret) { 325 + int enabled = init_net.ipv4.sysctl_tcp_early_demux; 326 + 327 + proc_configure_early_demux(enabled, IPPROTO_TCP); 328 + } 329 + 330 + return ret; 331 + } 332 + 333 + static int proc_udp_early_demux(struct ctl_table *table, int write, 334 + void __user *buffer, size_t *lenp, loff_t *ppos) 335 + { 336 + int ret = 0; 337 + 338 + ret = proc_dointvec(table, write, buffer, lenp, ppos); 339 + 340 + if (write && !ret) { 341 + int enabled = init_net.ipv4.sysctl_udp_early_demux; 342 + 343 + proc_configure_early_demux(enabled, IPPROTO_UDP); 344 + } 345 + 295 346 return ret; 296 347 } 297 348 ··· 801 748 .maxlen = sizeof(int), 802 749 .mode = 0644, 803 750 .proc_handler = proc_dointvec 751 + }, 752 + { 753 + .procname = "udp_early_demux", 754 + .data = &init_net.ipv4.sysctl_udp_early_demux, 755 + .maxlen = 
sizeof(int), 756 + .mode = 0644, 757 + .proc_handler = proc_udp_early_demux 758 + }, 759 + { 760 + .procname = "tcp_early_demux", 761 + .data = &init_net.ipv4.sysctl_tcp_early_demux, 762 + .maxlen = sizeof(int), 763 + .mode = 0644, 764 + .proc_handler = proc_tcp_early_demux 804 765 }, 805 766 { 806 767 .procname = "ip_default_ttl",
+4 -2
net/ipv6/ip6_input.c
··· 49 49 50 50 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 51 51 { 52 + void (*edemux)(struct sk_buff *skb); 53 + 52 54 /* if ingress device is enslaved to an L3 master device pass the 53 55 * skb to its handler for processing 54 56 */ ··· 62 60 const struct inet6_protocol *ipprot; 63 61 64 62 ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); 65 - if (ipprot && ipprot->early_demux) 66 - ipprot->early_demux(skb); 63 + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) 64 + edemux(skb); 67 65 } 68 66 if (!skb_valid_dst(skb)) 69 67 ip6_route_input(skb);
+1 -1
net/ipv6/protocol.c
··· 26 26 #include <net/protocol.h> 27 27 28 28 #if IS_ENABLED(CONFIG_IPV6) 29 - const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; 29 + struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; 30 30 EXPORT_SYMBOL(inet6_protos); 31 31 32 32 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+2 -1
net/ipv6/tcp_ipv6.c
··· 1925 1925 .diag_destroy = tcp_abort, 1926 1926 }; 1927 1927 1928 - static const struct inet6_protocol tcpv6_protocol = { 1928 + static struct inet6_protocol tcpv6_protocol = { 1929 1929 .early_demux = tcp_v6_early_demux, 1930 + .early_demux_handler = tcp_v6_early_demux, 1930 1931 .handler = tcp_v6_rcv, 1931 1932 .err_handler = tcp_v6_err, 1932 1933 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+2 -1
net/ipv6/udp.c
··· 1436 1436 } 1437 1437 #endif 1438 1438 1439 - static const struct inet6_protocol udpv6_protocol = { 1439 + static struct inet6_protocol udpv6_protocol = { 1440 1440 .early_demux = udp_v6_early_demux, 1441 + .early_demux_handler = udp_v6_early_demux, 1441 1442 .handler = udpv6_rcv, 1442 1443 .err_handler = udpv6_err, 1443 1444 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,