Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

1) Protect nft_ct template with global mutex, from Pavel Skripkin.

2) Two recent commits switched inet rt and nexthop exception hashes
from jhash to siphash. If those two spots are problematic then
conntrack is affected as well, so switch over to siphash too.
While at it, add a hard upper limit on chain lengths and reject
insertion if this is hit. Patches from Florian Westphal.

3) Fix use-after-scope in nf_socket_ipv6 reported by KASAN,
from Benjamin Hesmans.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
netfilter: socket: icmp6: fix use-after-scope
netfilter: refuse insertion if chain has grown too large
netfilter: conntrack: switch to siphash
netfilter: conntrack: sanitize table size default settings
netfilter: nft_ct: protect nft_ct_pcpu_template_refcnt with mutex
====================

Link: https://lore.kernel.org/r/20210903163020.13741-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+122 -58
+8 -5
Documentation/networking/nf_conntrack-sysctl.rst
··· 17 17 nf_conntrack_buckets - INTEGER 18 18 Size of hash table. If not specified as parameter during module 19 19 loading, the default size is calculated by dividing total memory 20 - by 16384 to determine the number of buckets but the hash table will 21 - never have fewer than 32 and limited to 16384 buckets. For systems 22 - with more than 4GB of memory it will be 65536 buckets. 20 + by 16384 to determine the number of buckets. The hash table will 21 + never have fewer than 1024 and never more than 262144 buckets. 23 22 This sysctl is only writeable in the initial net namespace. 24 23 25 24 nf_conntrack_checksum - BOOLEAN ··· 99 100 Log invalid packets of a type specified by value. 100 101 101 102 nf_conntrack_max - INTEGER 102 - Size of connection tracking table. Default value is 103 - nf_conntrack_buckets value * 4. 103 + Maximum number of allowed connection tracking entries. This value is set 104 + to nf_conntrack_buckets by default. 105 + Note that connection tracking entries are added to the table twice -- once 106 + for the original direction and once for the reply direction (i.e., with 107 + the reversed address). This means that with default settings a maxed-out 108 + table will have an average hash chain length of 2, not 1. 104 109 105 110 nf_conntrack_tcp_be_liberal - BOOLEAN 106 111 - 0 - disabled (default)
+1
include/linux/netfilter/nf_conntrack_common.h
··· 18 18 unsigned int expect_create; 19 19 unsigned int expect_delete; 20 20 unsigned int search_restart; 21 + unsigned int chaintoolong; 21 22 }; 22 23 23 24 #define NFCT_INFOMASK 7UL
+1 -3
net/ipv6/netfilter/nf_socket_ipv6.c
··· 99 99 { 100 100 __be16 dport, sport; 101 101 const struct in6_addr *daddr = NULL, *saddr = NULL; 102 - struct ipv6hdr *iph = ipv6_hdr(skb); 102 + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; 103 103 struct sk_buff *data_skb = NULL; 104 104 int doff = 0; 105 105 int thoff = 0, tproto; ··· 129 129 thoff + sizeof(*hp); 130 130 131 131 } else if (tproto == IPPROTO_ICMPV6) { 132 - struct ipv6hdr ipv6_var; 133 - 134 132 if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, 135 133 &sport, &dport, &ipv6_var)) 136 134 return NULL;
+66 -35
net/netfilter/nf_conntrack_core.c
··· 21 21 #include <linux/stddef.h> 22 22 #include <linux/slab.h> 23 23 #include <linux/random.h> 24 - #include <linux/jhash.h> 25 24 #include <linux/siphash.h> 26 25 #include <linux/err.h> 27 26 #include <linux/percpu.h> ··· 76 77 77 78 #define GC_SCAN_INTERVAL (120u * HZ) 78 79 #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) 80 + 81 + #define MAX_CHAINLEN 64u 79 82 80 83 static struct conntrack_gc_work conntrack_gc_work; 81 84 ··· 185 184 unsigned int nf_conntrack_max __read_mostly; 186 185 EXPORT_SYMBOL_GPL(nf_conntrack_max); 187 186 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 188 - static unsigned int nf_conntrack_hash_rnd __read_mostly; 187 + static siphash_key_t nf_conntrack_hash_rnd __read_mostly; 189 188 190 189 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 191 190 const struct net *net) 192 191 { 193 - unsigned int n; 194 - u32 seed; 192 + struct { 193 + struct nf_conntrack_man src; 194 + union nf_inet_addr dst_addr; 195 + u32 net_mix; 196 + u16 dport; 197 + u16 proto; 198 + } __aligned(SIPHASH_ALIGNMENT) combined; 195 199 196 200 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 197 201 198 - /* The direction must be ignored, so we hash everything up to the 199 - * destination ports (which is a multiple of 4) and treat the last 200 - * three bytes manually. 201 - */ 202 - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); 203 - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); 204 - return jhash2((u32 *)tuple, n, seed ^ 205 - (((__force __u16)tuple->dst.u.all << 16) | 206 - tuple->dst.protonum)); 202 + memset(&combined, 0, sizeof(combined)); 203 + 204 + /* The direction must be ignored, so handle usable members manually. 
*/ 205 + combined.src = tuple->src; 206 + combined.dst_addr = tuple->dst.u3; 207 + combined.net_mix = net_hash_mix(net); 208 + combined.dport = (__force __u16)tuple->dst.u.all; 209 + combined.proto = tuple->dst.protonum; 210 + 211 + return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); 207 212 } 208 213 209 214 static u32 scale_hash(u32 hash) ··· 842 835 unsigned int hash, reply_hash; 843 836 struct nf_conntrack_tuple_hash *h; 844 837 struct hlist_nulls_node *n; 838 + unsigned int chainlen = 0; 845 839 unsigned int sequence; 840 + int err = -EEXIST; 846 841 847 842 zone = nf_ct_zone(ct); 848 843 ··· 858 849 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 859 850 860 851 /* See if there's one in the list already, including reverse */ 861 - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 852 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 862 853 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 863 854 zone, net)) 864 855 goto out; 865 856 866 - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 857 + if (chainlen++ > MAX_CHAINLEN) 858 + goto chaintoolong; 859 + } 860 + 861 + chainlen = 0; 862 + 863 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 867 864 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 868 865 zone, net)) 869 866 goto out; 867 + if (chainlen++ > MAX_CHAINLEN) 868 + goto chaintoolong; 869 + } 870 870 871 871 smp_wmb(); 872 872 /* The caller holds a reference to this object */ ··· 885 867 NF_CT_STAT_INC(net, insert); 886 868 local_bh_enable(); 887 869 return 0; 888 - 870 + chaintoolong: 871 + NF_CT_STAT_INC(net, chaintoolong); 872 + err = -ENOSPC; 889 873 out: 890 874 nf_conntrack_double_unlock(hash, reply_hash); 891 875 local_bh_enable(); 892 - return -EEXIST; 876 + return err; 893 877 } 894 878 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 895 879 ··· 1104 1084 
__nf_conntrack_confirm(struct sk_buff *skb) 1105 1085 { 1106 1086 const struct nf_conntrack_zone *zone; 1087 + unsigned int chainlen = 0, sequence; 1107 1088 unsigned int hash, reply_hash; 1108 1089 struct nf_conntrack_tuple_hash *h; 1109 1090 struct nf_conn *ct; ··· 1112 1091 struct hlist_nulls_node *n; 1113 1092 enum ip_conntrack_info ctinfo; 1114 1093 struct net *net; 1115 - unsigned int sequence; 1116 1094 int ret = NF_DROP; 1117 1095 1118 1096 ct = nf_ct_get(skb, &ctinfo); ··· 1171 1151 /* See if there's one in the list already, including reverse: 1172 1152 NAT could have grabbed it without realizing, since we're 1173 1153 not in the hash. If there is, we lost race. */ 1174 - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 1154 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 1175 1155 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1176 1156 zone, net)) 1177 1157 goto out; 1158 + if (chainlen++ > MAX_CHAINLEN) 1159 + goto chaintoolong; 1160 + } 1178 1161 1179 - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 1162 + chainlen = 0; 1163 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 1180 1164 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1181 1165 zone, net)) 1182 1166 goto out; 1167 + if (chainlen++ > MAX_CHAINLEN) { 1168 + chaintoolong: 1169 + nf_ct_add_to_dying_list(ct); 1170 + NF_CT_STAT_INC(net, chaintoolong); 1171 + NF_CT_STAT_INC(net, insert_failed); 1172 + ret = NF_DROP; 1173 + goto dying; 1174 + } 1175 + } 1183 1176 1184 1177 /* Timer relative to confirmation time, not original 1185 1178 setting time, otherwise we'd get timer wrap in ··· 2627 2594 spin_lock_init(&nf_conntrack_locks[i]); 2628 2595 2629 2596 if (!nf_conntrack_htable_size) { 2630 - /* Idea from tcp.c: use 1/16384 of memory. 2631 - * On i386: 32MB machine has 512 buckets. 2632 - * >= 1GB machines have 16384 buckets. 2633 - * >= 4GB machines have 65536 buckets. 
2634 - */ 2635 2597 nf_conntrack_htable_size 2636 2598 = (((nr_pages << PAGE_SHIFT) / 16384) 2637 2599 / sizeof(struct hlist_head)); 2638 - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2639 - nf_conntrack_htable_size = 65536; 2600 + if (BITS_PER_LONG >= 64 && 2601 + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2602 + nf_conntrack_htable_size = 262144; 2640 2603 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2641 - nf_conntrack_htable_size = 16384; 2642 - if (nf_conntrack_htable_size < 32) 2643 - nf_conntrack_htable_size = 32; 2604 + nf_conntrack_htable_size = 65536; 2644 2605 2645 - /* Use a max. factor of four by default to get the same max as 2646 - * with the old struct list_heads. When a table size is given 2647 - * we use the old value of 8 to avoid reducing the max. 2648 - * entries. */ 2649 - max_factor = 4; 2606 + if (nf_conntrack_htable_size < 1024) 2607 + nf_conntrack_htable_size = 1024; 2608 + /* Use a max. factor of one by default to keep the average 2609 + * hash chain length at 2 entries. Each entry has to be added 2610 + * twice (once for original direction, once for reply). 2611 + * When a table size is given we use the old value of 8 to 2612 + * avoid implicit reduction of the max entries setting. 2613 + */ 2614 + max_factor = 1; 2650 2615 } 2651 2616 2652 2617 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+18 -7
net/netfilter/nf_conntrack_expect.c
··· 17 17 #include <linux/err.h> 18 18 #include <linux/percpu.h> 19 19 #include <linux/kernel.h> 20 - #include <linux/jhash.h> 20 + #include <linux/siphash.h> 21 21 #include <linux/moduleparam.h> 22 22 #include <linux/export.h> 23 23 #include <net/net_namespace.h> ··· 41 41 unsigned int nf_ct_expect_max __read_mostly; 42 42 43 43 static struct kmem_cache *nf_ct_expect_cachep __read_mostly; 44 - static unsigned int nf_ct_expect_hashrnd __read_mostly; 44 + static siphash_key_t nf_ct_expect_hashrnd __read_mostly; 45 45 46 46 /* nf_conntrack_expect helper functions */ 47 47 void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, ··· 81 81 82 82 static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) 83 83 { 84 - unsigned int hash, seed; 84 + struct { 85 + union nf_inet_addr dst_addr; 86 + u32 net_mix; 87 + u16 dport; 88 + u8 l3num; 89 + u8 protonum; 90 + } __aligned(SIPHASH_ALIGNMENT) combined; 91 + u32 hash; 85 92 86 93 get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); 87 94 88 - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); 95 + memset(&combined, 0, sizeof(combined)); 89 96 90 - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), 91 - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | 92 - (__force __u16)tuple->dst.u.all) ^ seed); 97 + combined.dst_addr = tuple->dst.u3; 98 + combined.net_mix = net_hash_mix(n); 99 + combined.dport = (__force __u16)tuple->dst.u.all; 100 + combined.l3num = tuple->src.l3num; 101 + combined.protonum = tuple->dst.protonum; 102 + 103 + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); 93 104 94 105 return reciprocal_scale(hash, nf_ct_expect_hsize); 95 106 }
+3 -1
net/netfilter/nf_conntrack_netlink.c
··· 2528 2528 nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, 2529 2529 htonl(st->search_restart)) || 2530 2530 nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, 2531 - htonl(st->clash_resolve))) 2531 + htonl(st->clash_resolve)) || 2532 + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, 2533 + htonl(st->chaintoolong))) 2532 2534 goto nla_put_failure; 2533 2535 2534 2536 nlmsg_end(skb, nlh);
+2 -2
net/netfilter/nf_conntrack_standalone.c
··· 432 432 unsigned int nr_conntracks; 433 433 434 434 if (v == SEQ_START_TOKEN) { 435 - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); 435 + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); 436 436 return 0; 437 437 } 438 438 ··· 447 447 st->invalid, 448 448 0, 449 449 0, 450 - 0, 450 + st->chaintoolong, 451 451 st->insert, 452 452 st->insert_failed, 453 453 st->drop,
+14 -4
net/netfilter/nf_nat_core.c
··· 13 13 #include <linux/skbuff.h> 14 14 #include <linux/gfp.h> 15 15 #include <net/xfrm.h> 16 - #include <linux/jhash.h> 16 + #include <linux/siphash.h> 17 17 #include <linux/rtnetlink.h> 18 18 19 19 #include <net/netfilter/nf_conntrack.h> ··· 34 34 35 35 static struct hlist_head *nf_nat_bysource __read_mostly; 36 36 static unsigned int nf_nat_htable_size __read_mostly; 37 - static unsigned int nf_nat_hash_rnd __read_mostly; 37 + static siphash_key_t nf_nat_hash_rnd __read_mostly; 38 38 39 39 struct nf_nat_lookup_hook_priv { 40 40 struct nf_hook_entries __rcu *entries; ··· 153 153 hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) 154 154 { 155 155 unsigned int hash; 156 + struct { 157 + struct nf_conntrack_man src; 158 + u32 net_mix; 159 + u32 protonum; 160 + } __aligned(SIPHASH_ALIGNMENT) combined; 156 161 157 162 get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); 158 163 164 + memset(&combined, 0, sizeof(combined)); 165 + 159 166 /* Original src, to ensure we map it consistently if poss. */ 160 - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), 161 - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); 167 + combined.src = tuple->src; 168 + combined.net_mix = net_hash_mix(n); 169 + combined.protonum = tuple->dst.protonum; 170 + 171 + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); 162 172 163 173 return reciprocal_scale(hash, nf_nat_htable_size); 164 174 }
+8 -1
net/netfilter/nft_ct.c
··· 41 41 #ifdef CONFIG_NF_CONNTRACK_ZONES 42 42 static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); 43 43 static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; 44 + static DEFINE_MUTEX(nft_ct_pcpu_mutex); 44 45 #endif 45 46 46 47 static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, ··· 526 525 #endif 527 526 #ifdef CONFIG_NF_CONNTRACK_ZONES 528 527 case NFT_CT_ZONE: 528 + mutex_lock(&nft_ct_pcpu_mutex); 529 529 if (--nft_ct_pcpu_template_refcnt == 0) 530 530 nft_ct_tmpl_put_pcpu(); 531 + mutex_unlock(&nft_ct_pcpu_mutex); 531 532 break; 532 533 #endif 533 534 default: ··· 567 564 #endif 568 565 #ifdef CONFIG_NF_CONNTRACK_ZONES 569 566 case NFT_CT_ZONE: 570 - if (!nft_ct_tmpl_alloc_pcpu()) 567 + mutex_lock(&nft_ct_pcpu_mutex); 568 + if (!nft_ct_tmpl_alloc_pcpu()) { 569 + mutex_unlock(&nft_ct_pcpu_mutex); 571 570 return -ENOMEM; 571 + } 572 572 nft_ct_pcpu_template_refcnt++; 573 + mutex_unlock(&nft_ct_pcpu_mutex); 573 574 len = sizeof(u16); 574 575 break; 575 576 #endif