Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nat: convert nat bysrc hash to rhashtable

It used a fixed-size bucket list plus a single lock to protect add/del.

Unlike the main conntrack table we only need to add and remove keys.
Convert it to rhashtable to get table autosizing and per-bucket locking.

The maximum number of entries is -- as before -- tied to the number of
conntracks so we do not need another upperlimit.

The change does not handle rhashtable_remove_fast errors; the only possible
"error" is -ENOENT, and that is something that can happen legitimately,
e.g. because the nat module was inserted at a later time and no src manip
took place yet.

Tested with http-client-benchmark + httpterm with DNAT and SNAT rules
in place.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
870190a9 7c966435

+69 -57
+2 -1
include/net/netfilter/nf_conntrack.h
··· 17 17 #include <linux/bitops.h> 18 18 #include <linux/compiler.h> 19 19 #include <linux/atomic.h> 20 + #include <linux/rhashtable.h> 20 21 21 22 #include <linux/netfilter/nf_conntrack_tcp.h> 22 23 #include <linux/netfilter/nf_conntrack_dccp.h> ··· 119 118 struct nf_ct_ext *ext; 120 119 121 120 #if IS_ENABLED(CONFIG_NF_NAT) 122 - struct hlist_node nat_bysource; 121 + struct rhash_head nat_bysource; 123 122 #endif 124 123 /* Storage reserved for other modules, must be the last member */ 125 124 union nf_conntrack_proto proto;
+1
include/net/netfilter/nf_nat.h
··· 1 1 #ifndef _NF_NAT_H 2 2 #define _NF_NAT_H 3 + #include <linux/rhashtable.h> 3 4 #include <linux/netfilter_ipv4.h> 4 5 #include <linux/netfilter/nf_nat.h> 5 6 #include <net/netfilter/nf_conntrack_tuple.h>
+66 -56
net/netfilter/nf_nat_core.c
··· 30 30 #include <net/netfilter/nf_conntrack_zones.h> 31 31 #include <linux/netfilter/nf_nat.h> 32 32 33 - static DEFINE_SPINLOCK(nf_nat_lock); 34 - 35 33 static DEFINE_MUTEX(nf_nat_proto_mutex); 36 34 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] 37 35 __read_mostly; 38 36 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] 39 37 __read_mostly; 40 38 41 - static struct hlist_head *nf_nat_bysource __read_mostly; 42 - static unsigned int nf_nat_htable_size __read_mostly; 43 - static unsigned int nf_nat_hash_rnd __read_mostly; 39 + struct nf_nat_conn_key { 40 + const struct net *net; 41 + const struct nf_conntrack_tuple *tuple; 42 + const struct nf_conntrack_zone *zone; 43 + }; 44 + 45 + static struct rhashtable nf_nat_bysource_table; 44 46 45 47 inline const struct nf_nat_l3proto * 46 48 __nf_nat_l3proto_find(u8 family) ··· 121 119 EXPORT_SYMBOL(nf_xfrm_me_harder); 122 120 #endif /* CONFIG_XFRM */ 123 121 124 - /* We keep an extra hash for each conntrack, for fast searching. */ 125 - static inline unsigned int 126 - hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) 122 + static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) 127 123 { 128 - unsigned int hash; 124 + const struct nf_conntrack_tuple *t; 125 + const struct nf_conn *ct = data; 129 126 130 - get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); 131 - 127 + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 132 128 /* Original src, to ensure we map it consistently if poss. */ 133 - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), 134 - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); 135 129 136 - return reciprocal_scale(hash, nf_nat_htable_size); 130 + seed ^= net_hash_mix(nf_ct_net(ct)); 131 + return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), 132 + t->dst.protonum ^ seed); 137 133 } 138 134 139 135 /* Is this tuple already taken? 
(not by us) */ ··· 187 187 t->src.u.all == tuple->src.u.all); 188 188 } 189 189 190 + static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, 191 + const void *obj) 192 + { 193 + const struct nf_nat_conn_key *key = arg->key; 194 + const struct nf_conn *ct = obj; 195 + 196 + return same_src(ct, key->tuple) && 197 + net_eq(nf_ct_net(ct), key->net) && 198 + nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); 199 + } 200 + 201 + static struct rhashtable_params nf_nat_bysource_params = { 202 + .head_offset = offsetof(struct nf_conn, nat_bysource), 203 + .obj_hashfn = nf_nat_bysource_hash, 204 + .obj_cmpfn = nf_nat_bysource_cmp, 205 + .nelem_hint = 256, 206 + .min_size = 1024, 207 + .nulls_base = (1U << RHT_BASE_SHIFT), 208 + }; 209 + 190 210 /* Only called for SRC manip */ 191 211 static int 192 212 find_appropriate_src(struct net *net, ··· 217 197 struct nf_conntrack_tuple *result, 218 198 const struct nf_nat_range *range) 219 199 { 220 - unsigned int h = hash_by_src(net, tuple); 221 200 const struct nf_conn *ct; 201 + struct nf_nat_conn_key key = { 202 + .net = net, 203 + .tuple = tuple, 204 + .zone = zone 205 + }; 222 206 223 - hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { 224 - if (same_src(ct, tuple) && 225 - net_eq(net, nf_ct_net(ct)) && 226 - nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { 227 - /* Copy source part from reply tuple. 
*/ 228 - nf_ct_invert_tuplepr(result, 229 - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 230 - result->dst = tuple->dst; 207 + ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, 208 + nf_nat_bysource_params); 209 + if (!ct) 210 + return 0; 231 211 232 - if (in_range(l3proto, l4proto, result, range)) 233 - return 1; 234 - } 235 - } 236 - return 0; 212 + nf_ct_invert_tuplepr(result, 213 + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 214 + result->dst = tuple->dst; 215 + 216 + return in_range(l3proto, l4proto, result, range); 237 217 } 238 218 239 219 /* For [FUTURE] fragmentation handling, we want the least-used ··· 405 385 const struct nf_nat_range *range, 406 386 enum nf_nat_manip_type maniptype) 407 387 { 408 - struct net *net = nf_ct_net(ct); 409 388 struct nf_conntrack_tuple curr_tuple, new_tuple; 410 389 struct nf_conn_nat *nat; 411 390 ··· 445 426 } 446 427 447 428 if (maniptype == NF_NAT_MANIP_SRC) { 448 - unsigned int srchash; 429 + int err; 449 430 450 - srchash = hash_by_src(net, 451 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 452 - spin_lock_bh(&nf_nat_lock); 453 - /* nf_conntrack_alter_reply might re-allocate extension aera */ 454 - nat = nfct_nat(ct); 455 - hlist_add_head_rcu(&ct->nat_bysource, 456 - &nf_nat_bysource[srchash]); 457 - spin_unlock_bh(&nf_nat_lock); 431 + err = rhashtable_insert_fast(&nf_nat_bysource_table, 432 + &ct->nat_bysource, 433 + nf_nat_bysource_params); 434 + if (err) 435 + return NF_DROP; 458 436 } 459 437 460 438 /* It's done. 
*/ ··· 568 552 if (!del_timer(&ct->timeout)) 569 553 return 1; 570 554 571 - spin_lock_bh(&nf_nat_lock); 572 - hlist_del_rcu(&ct->nat_bysource); 573 555 ct->status &= ~IPS_NAT_DONE_MASK; 574 - spin_unlock_bh(&nf_nat_lock); 556 + 557 + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, 558 + nf_nat_bysource_params); 575 559 576 560 add_timer(&ct->timeout); 577 561 ··· 703 687 if (!nat) 704 688 return; 705 689 706 - NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); 707 - 708 - spin_lock_bh(&nf_nat_lock); 709 - hlist_del_rcu(&ct->nat_bysource); 710 - spin_unlock_bh(&nf_nat_lock); 690 + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, 691 + nf_nat_bysource_params); 711 692 } 712 693 713 694 static struct nf_ct_ext_type nat_extend __read_mostly = { ··· 839 826 { 840 827 int ret; 841 828 842 - /* Leave them the same for the moment. */ 843 - nf_nat_htable_size = nf_conntrack_htable_size; 844 - 845 - nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); 846 - if (!nf_nat_bysource) 847 - return -ENOMEM; 829 + ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); 830 + if (ret) 831 + return ret; 848 832 849 833 ret = nf_ct_extend_register(&nat_extend); 850 834 if (ret < 0) { 851 - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); 835 + rhashtable_destroy(&nf_nat_bysource_table); 852 836 printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); 853 837 return ret; 854 838 } ··· 869 859 return 0; 870 860 871 861 cleanup_extend: 872 - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); 862 + rhashtable_destroy(&nf_nat_bysource_table); 873 863 nf_ct_extend_unregister(&nat_extend); 874 864 return ret; 875 865 } ··· 887 877 #endif 888 878 for (i = 0; i < NFPROTO_NUMPROTO; i++) 889 879 kfree(nf_nat_l4protos[i]); 890 - synchronize_net(); 891 - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); 880 + 881 + rhashtable_destroy(&nf_nat_bysource_table); 892 882 } 893 883 894 884 MODULE_LICENSE("GPL");