Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: conntrack: use a single hashtable for all namespaces

We already include the netns address in the hash and compare the netns pointers
during lookup, so even if namespaces have overlapping addresses, entries
will be spread across the table.

Assuming a 64k bucket size, this change saves 0.5 MB per namespace on a
64-bit system.

The NAT bysrc and expectation hashes are still per namespace; those will
be changed too, soon.

Future patch will also make conntrack object slab cache global again.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
56d52d48 1b8c8a9f

+62 -68
+1
include/net/netfilter/nf_conntrack_core.h
··· 81 81 82 82 #define CONNTRACK_LOCKS 1024 83 83 84 + extern struct hlist_nulls_head *nf_conntrack_hash; 84 85 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 85 86 void nf_conntrack_lock(spinlock_t *lock); 86 87
-2
include/net/netns/conntrack.h
··· 93 93 int sysctl_tstamp; 94 94 int sysctl_checksum; 95 95 96 - unsigned int htable_size; 97 96 struct kmem_cache *nf_conntrack_cachep; 98 - struct hlist_nulls_head *hash; 99 97 struct hlist_head *expect_hash; 100 98 struct ct_pcpu __percpu *pcpu_lists; 101 99 struct ip_conntrack_stat __percpu *stat;
+1 -1
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
··· 360 360 361 361 in->ctl_table[0].data = &nf_conntrack_max; 362 362 in->ctl_table[1].data = &net->ct.count; 363 - in->ctl_table[2].data = &net->ct.htable_size; 363 + in->ctl_table[2].data = &nf_conntrack_htable_size; 364 364 in->ctl_table[3].data = &net->ct.sysctl_checksum; 365 365 in->ctl_table[4].data = &net->ct.sysctl_log_invalid; 366 366 #endif
+4 -6
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
··· 31 31 32 32 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) 33 33 { 34 - struct net *net = seq_file_net(seq); 35 34 struct ct_iter_state *st = seq->private; 36 35 struct hlist_nulls_node *n; 37 36 38 37 for (st->bucket = 0; 39 - st->bucket < net->ct.htable_size; 38 + st->bucket < nf_conntrack_htable_size; 40 39 st->bucket++) { 41 40 n = rcu_dereference( 42 - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); 41 + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); 43 42 if (!is_a_nulls(n)) 44 43 return n; 45 44 } ··· 48 49 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, 49 50 struct hlist_nulls_node *head) 50 51 { 51 - struct net *net = seq_file_net(seq); 52 52 struct ct_iter_state *st = seq->private; 53 53 54 54 head = rcu_dereference(hlist_nulls_next_rcu(head)); 55 55 while (is_a_nulls(head)) { 56 56 if (likely(get_nulls_value(head) == st->bucket)) { 57 - if (++st->bucket >= net->ct.htable_size) 57 + if (++st->bucket >= nf_conntrack_htable_size) 58 58 return NULL; 59 59 } 60 60 head = rcu_dereference( 61 - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); 61 + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); 62 62 } 63 63 return head; 64 64 }
+40 -40
net/netfilter/nf_conntrack_core.c
··· 69 69 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 70 70 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 71 71 72 + struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 73 + EXPORT_SYMBOL_GPL(nf_conntrack_hash); 74 + 72 75 static __read_mostly spinlock_t nf_conntrack_locks_all_lock; 73 76 static __read_mostly seqcount_t nf_conntrack_generation; 74 77 static __read_mostly bool nf_conntrack_locks_all; ··· 167 164 tuple->dst.protonum)); 168 165 } 169 166 170 - static u32 hash_bucket(u32 hash, const struct net *net) 167 + static u32 scale_hash(u32 hash) 171 168 { 172 - return reciprocal_scale(hash, net->ct.htable_size); 169 + return reciprocal_scale(hash, nf_conntrack_htable_size); 173 170 } 174 171 175 172 static u32 __hash_conntrack(const struct net *net, ··· 182 179 static u32 hash_conntrack(const struct net *net, 183 180 const struct nf_conntrack_tuple *tuple) 184 181 { 185 - return __hash_conntrack(net, tuple, net->ct.htable_size); 182 + return scale_hash(hash_conntrack_raw(tuple, net)); 186 183 } 187 184 188 185 bool ··· 481 478 begin: 482 479 do { 483 480 sequence = read_seqcount_begin(&nf_conntrack_generation); 484 - bucket = hash_bucket(hash, net); 485 - ct_hash = net->ct.hash; 481 + bucket = scale_hash(hash); 482 + ct_hash = nf_conntrack_hash; 486 483 } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); 487 484 488 485 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { ··· 546 543 unsigned int hash, 547 544 unsigned int reply_hash) 548 545 { 549 - struct net *net = nf_ct_net(ct); 550 - 551 546 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 552 - &net->ct.hash[hash]); 547 + &nf_conntrack_hash[hash]); 553 548 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 554 - &net->ct.hash[reply_hash]); 549 + &nf_conntrack_hash[reply_hash]); 555 550 } 556 551 557 552 int ··· 574 573 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 575 574 576 575 /* See if 
there's one in the list already, including reverse */ 577 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 576 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 578 577 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 579 578 zone, net)) 580 579 goto out; 581 580 582 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) 581 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 583 582 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 584 583 zone, net)) 585 584 goto out; ··· 634 633 sequence = read_seqcount_begin(&nf_conntrack_generation); 635 634 /* reuse the hash saved before */ 636 635 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 637 - hash = hash_bucket(hash, net); 636 + hash = scale_hash(hash); 638 637 reply_hash = hash_conntrack(net, 639 638 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 640 639 ··· 664 663 /* See if there's one in the list already, including reverse: 665 664 NAT could have grabbed it without realizing, since we're 666 665 not in the hash. If there is, we lost race. 
*/ 667 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 666 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 668 667 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 669 668 zone, net)) 670 669 goto out; 671 670 672 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) 671 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 673 672 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 674 673 zone, net)) 675 674 goto out; ··· 737 736 do { 738 737 sequence = read_seqcount_begin(&nf_conntrack_generation); 739 738 hash = hash_conntrack(net, tuple); 740 - ct_hash = net->ct.hash; 739 + ct_hash = nf_conntrack_hash; 741 740 } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); 742 741 743 742 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ··· 774 773 local_bh_disable(); 775 774 restart: 776 775 sequence = read_seqcount_begin(&nf_conntrack_generation); 777 - hash = hash_bucket(_hash, net); 778 - for (; i < net->ct.htable_size; i++) { 776 + hash = scale_hash(_hash); 777 + for (; i < nf_conntrack_htable_size; i++) { 779 778 lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; 780 779 nf_conntrack_lock(lockp); 781 780 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { 782 781 spin_unlock(lockp); 783 782 goto restart; 784 783 } 785 - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], 786 - hnnode) { 784 + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], 785 + hnnode) { 787 786 tmp = nf_ct_tuplehash_to_ctrack(h); 788 787 if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && 789 788 !nf_ct_is_dying(tmp) && ··· 794 793 cnt++; 795 794 } 796 795 797 - hash = (hash + 1) % net->ct.htable_size; 796 + hash = (hash + 1) % nf_conntrack_htable_size; 798 797 spin_unlock(lockp); 799 798 800 799 if (ct || cnt >= NF_CT_EVICTION_RANGE) ··· 1377 1376 int cpu; 1378 1377 spinlock_t *lockp; 1379 1378 1380 - for (; *bucket < net->ct.htable_size; 
(*bucket)++) { 1379 + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 1381 1380 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 1382 1381 local_bh_disable(); 1383 1382 nf_conntrack_lock(lockp); 1384 - if (*bucket < net->ct.htable_size) { 1385 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { 1383 + if (*bucket < nf_conntrack_htable_size) { 1384 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { 1386 1385 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 1387 1386 continue; 1388 1387 ct = nf_ct_tuplehash_to_ctrack(h); ··· 1479 1478 while (untrack_refs() > 0) 1480 1479 schedule(); 1481 1480 1481 + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 1482 + 1482 1483 #ifdef CONFIG_NF_CONNTRACK_ZONES 1483 1484 nf_ct_extend_unregister(&nf_ct_zone_extend); 1484 1485 #endif ··· 1531 1528 } 1532 1529 1533 1530 list_for_each_entry(net, net_exit_list, exit_list) { 1534 - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); 1535 1531 nf_conntrack_proto_pernet_fini(net); 1536 1532 nf_conntrack_helper_pernet_fini(net); 1537 1533 nf_conntrack_ecache_pernet_fini(net); ··· 1601 1599 * though since that required taking the locks. 
1602 1600 */ 1603 1601 1604 - for (i = 0; i < init_net.ct.htable_size; i++) { 1605 - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { 1606 - h = hlist_nulls_entry(init_net.ct.hash[i].first, 1607 - struct nf_conntrack_tuple_hash, hnnode); 1602 + for (i = 0; i < nf_conntrack_htable_size; i++) { 1603 + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 1604 + h = hlist_nulls_entry(nf_conntrack_hash[i].first, 1605 + struct nf_conntrack_tuple_hash, hnnode); 1608 1606 ct = nf_ct_tuplehash_to_ctrack(h); 1609 1607 hlist_nulls_del_rcu(&h->hnnode); 1610 1608 bucket = __hash_conntrack(nf_ct_net(ct), ··· 1612 1610 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 1613 1611 } 1614 1612 } 1615 - old_size = init_net.ct.htable_size; 1616 - old_hash = init_net.ct.hash; 1613 + old_size = nf_conntrack_htable_size; 1614 + old_hash = nf_conntrack_hash; 1617 1615 1618 - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; 1619 - init_net.ct.hash = hash; 1616 + nf_conntrack_hash = hash; 1617 + nf_conntrack_htable_size = hashsize; 1620 1618 1621 1619 write_seqcount_end(&nf_conntrack_generation); 1622 1620 nf_conntrack_all_unlock(); ··· 1672 1670 * entries. 
*/ 1673 1671 max_factor = 4; 1674 1672 } 1673 + 1674 + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 1675 + if (!nf_conntrack_hash) 1676 + return -ENOMEM; 1677 + 1675 1678 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 1676 1679 1677 1680 printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", ··· 1755 1748 err_acct: 1756 1749 nf_conntrack_expect_fini(); 1757 1750 err_expect: 1751 + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); 1758 1752 return ret; 1759 1753 } 1760 1754 ··· 1808 1800 goto err_cache; 1809 1801 } 1810 1802 1811 - net->ct.htable_size = nf_conntrack_htable_size; 1812 - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); 1813 - if (!net->ct.hash) { 1814 - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); 1815 - goto err_hash; 1816 - } 1817 1803 ret = nf_conntrack_expect_pernet_init(net); 1818 1804 if (ret < 0) 1819 1805 goto err_expect; ··· 1839 1837 err_acct: 1840 1838 nf_conntrack_expect_pernet_fini(net); 1841 1839 err_expect: 1842 - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); 1843 - err_hash: 1844 1840 kmem_cache_destroy(net->ct.nf_conntrack_cachep); 1845 1841 err_cache: 1846 1842 kfree(net->ct.slabname);
+3 -3
net/netfilter/nf_conntrack_helper.c
··· 424 424 spin_unlock_bh(&pcpu->lock); 425 425 } 426 426 local_bh_disable(); 427 - for (i = 0; i < net->ct.htable_size; i++) { 427 + for (i = 0; i < nf_conntrack_htable_size; i++) { 428 428 nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); 429 - if (i < net->ct.htable_size) { 430 - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) 429 + if (i < nf_conntrack_htable_size) { 430 + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) 431 431 unhelp(h, me); 432 432 } 433 433 spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+4 -4
net/netfilter/nf_conntrack_netlink.c
··· 824 824 last = (struct nf_conn *)cb->args[1]; 825 825 826 826 local_bh_disable(); 827 - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { 827 + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { 828 828 restart: 829 829 lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; 830 830 nf_conntrack_lock(lockp); 831 - if (cb->args[0] >= net->ct.htable_size) { 831 + if (cb->args[0] >= nf_conntrack_htable_size) { 832 832 spin_unlock(lockp); 833 833 goto out; 834 834 } 835 - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], 836 - hnnode) { 835 + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], 836 + hnnode) { 837 837 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 838 838 continue; 839 839 ct = nf_ct_tuplehash_to_ctrack(h);
+5 -8
net/netfilter/nf_conntrack_standalone.c
··· 54 54 55 55 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) 56 56 { 57 - struct net *net = seq_file_net(seq); 58 57 struct ct_iter_state *st = seq->private; 59 58 struct hlist_nulls_node *n; 60 59 61 60 for (st->bucket = 0; 62 - st->bucket < net->ct.htable_size; 61 + st->bucket < nf_conntrack_htable_size; 63 62 st->bucket++) { 64 - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); 63 + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); 65 64 if (!is_a_nulls(n)) 66 65 return n; 67 66 } ··· 70 71 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, 71 72 struct hlist_nulls_node *head) 72 73 { 73 - struct net *net = seq_file_net(seq); 74 74 struct ct_iter_state *st = seq->private; 75 75 76 76 head = rcu_dereference(hlist_nulls_next_rcu(head)); 77 77 while (is_a_nulls(head)) { 78 78 if (likely(get_nulls_value(head) == st->bucket)) { 79 - if (++st->bucket >= net->ct.htable_size) 79 + if (++st->bucket >= nf_conntrack_htable_size) 80 80 return NULL; 81 81 } 82 82 head = rcu_dereference( 83 83 hlist_nulls_first_rcu( 84 - &net->ct.hash[st->bucket])); 84 + &nf_conntrack_hash[st->bucket])); 85 85 } 86 86 return head; 87 87 } ··· 456 458 }, 457 459 { 458 460 .procname = "nf_conntrack_buckets", 459 - .data = &init_net.ct.htable_size, 461 + .data = &nf_conntrack_htable_size, 460 462 .maxlen = sizeof(unsigned int), 461 463 .mode = 0444, 462 464 .proc_handler = proc_dointvec, ··· 510 512 goto out_kmemdup; 511 513 512 514 table[1].data = &net->ct.count; 513 - table[2].data = &net->ct.htable_size; 514 515 table[3].data = &net->ct.sysctl_checksum; 515 516 table[4].data = &net->ct.sysctl_log_invalid; 516 517
+1 -1
net/netfilter/nf_nat_core.c
··· 824 824 static int __net_init nf_nat_net_init(struct net *net) 825 825 { 826 826 /* Leave them the same for the moment. */ 827 - net->ct.nat_htable_size = net->ct.htable_size; 827 + net->ct.nat_htable_size = nf_conntrack_htable_size; 828 828 net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); 829 829 if (!net->ct.nat_bysource) 830 830 return -ENOMEM;