Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu()

Use "hlist_nulls" infrastructure we added in 2.6.29 for RCUification of UDP & TCP.

This permits an easy conversion from call_rcu() based hash lists to a
SLAB_DESTROY_BY_RCU one.

Avoiding call_rcu() delay at nf_conn freeing time has numerous gains.

- First, it doesn't fill RCU queues (up to 10000 elements per cpu).
This reduces OOM possibility, if queued elements are not taken into account.
This also reduces latency problems when the RCU queue size hits its high limit
and triggers emergency mode.

- It allows fast reuse of just freed elements, permitting better use of
CPU cache.

- We delete rcu_head from "struct nf_conn", shrinking size of this structure
by 8 or 16 bytes.

This patch only takes care of "struct nf_conn".
call_rcu() is still used for less critical conntrack parts, that may
be converted later if necessary.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>

Authored by Eric Dumazet, committed by Patrick McHardy

commit ea781f19 (parent 1f9352ae)

+174 -131
+8 -6
include/net/netfilter/nf_conntrack.h
··· 91 91 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 92 92 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 93 93 94 - struct nf_conn 95 - { 94 + struct nf_conn { 96 95 /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, 97 96 plus 1 for any connection(s) we are `master' for */ 98 97 struct nf_conntrack ct_general; ··· 125 126 #ifdef CONFIG_NET_NS 126 127 struct net *ct_net; 127 128 #endif 128 - struct rcu_head rcu; 129 129 }; 130 130 131 131 static inline struct nf_conn * ··· 188 190 extern int nf_ct_l3proto_try_module_get(unsigned short l3proto); 189 191 extern void nf_ct_l3proto_module_put(unsigned short l3proto); 190 192 191 - extern struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced); 192 - extern void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, 193 - unsigned int size); 193 + /* 194 + * Allocate a hashtable of hlist_head (if nulls == 0), 195 + * or hlist_nulls_head (if nulls == 1) 196 + */ 197 + extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls); 198 + 199 + extern void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size); 194 200 195 201 extern struct nf_conntrack_tuple_hash * 196 202 __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
+3 -3
include/net/netfilter/nf_conntrack_tuple.h
··· 12 12 13 13 #include <linux/netfilter/x_tables.h> 14 14 #include <linux/netfilter/nf_conntrack_tuple_common.h> 15 + #include <linux/list_nulls.h> 15 16 16 17 /* A `tuple' is a structure containing the information to uniquely 17 18 identify a connection. ie. if two packets have the same tuple, they ··· 147 146 ((enum ip_conntrack_dir)(h)->tuple.dst.dir) 148 147 149 148 /* Connections have two entries in the hash table: one for each way */ 150 - struct nf_conntrack_tuple_hash 151 - { 152 - struct hlist_node hnode; 149 + struct nf_conntrack_tuple_hash { 150 + struct hlist_nulls_node hnnode; 153 151 struct nf_conntrack_tuple tuple; 154 152 }; 155 153
+3 -2
include/net/netns/conntrack.h
··· 2 2 #define __NETNS_CONNTRACK_H 3 3 4 4 #include <linux/list.h> 5 + #include <linux/list_nulls.h> 5 6 #include <asm/atomic.h> 6 7 7 8 struct ctl_table_header; ··· 11 10 struct netns_ct { 12 11 atomic_t count; 13 12 unsigned int expect_count; 14 - struct hlist_head *hash; 13 + struct hlist_nulls_head *hash; 15 14 struct hlist_head *expect_hash; 16 - struct hlist_head unconfirmed; 15 + struct hlist_nulls_head unconfirmed; 17 16 struct ip_conntrack_stat *stat; 18 17 #ifdef CONFIG_NF_CONNTRACK_EVENTS 19 18 struct nf_conntrack_ecache *ecache;
+36 -27
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
··· 25 25 unsigned int bucket; 26 26 }; 27 27 28 - static struct hlist_node *ct_get_first(struct seq_file *seq) 28 + static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) 29 29 { 30 30 struct net *net = seq_file_net(seq); 31 31 struct ct_iter_state *st = seq->private; 32 - struct hlist_node *n; 32 + struct hlist_nulls_node *n; 33 33 34 34 for (st->bucket = 0; 35 35 st->bucket < nf_conntrack_htable_size; 36 36 st->bucket++) { 37 37 n = rcu_dereference(net->ct.hash[st->bucket].first); 38 - if (n) 38 + if (!is_a_nulls(n)) 39 39 return n; 40 40 } 41 41 return NULL; 42 42 } 43 43 44 - static struct hlist_node *ct_get_next(struct seq_file *seq, 45 - struct hlist_node *head) 44 + static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, 45 + struct hlist_nulls_node *head) 46 46 { 47 47 struct net *net = seq_file_net(seq); 48 48 struct ct_iter_state *st = seq->private; 49 49 50 50 head = rcu_dereference(head->next); 51 - while (head == NULL) { 52 - if (++st->bucket >= nf_conntrack_htable_size) 53 - return NULL; 51 + while (is_a_nulls(head)) { 52 + if (likely(get_nulls_value(head) == st->bucket)) { 53 + if (++st->bucket >= nf_conntrack_htable_size) 54 + return NULL; 55 + } 54 56 head = rcu_dereference(net->ct.hash[st->bucket].first); 55 57 } 56 58 return head; 57 59 } 58 60 59 - static struct hlist_node *ct_get_idx(struct seq_file *seq, loff_t pos) 61 + static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) 60 62 { 61 - struct hlist_node *head = ct_get_first(seq); 63 + struct hlist_nulls_node *head = ct_get_first(seq); 62 64 63 65 if (head) 64 66 while (pos && (head = ct_get_next(seq, head))) ··· 89 87 90 88 static int ct_seq_show(struct seq_file *s, void *v) 91 89 { 92 - const struct nf_conntrack_tuple_hash *hash = v; 93 - const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); 90 + struct nf_conntrack_tuple_hash *hash = v; 91 + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); 94 92 const struct nf_conntrack_l3proto 
*l3proto; 95 93 const struct nf_conntrack_l4proto *l4proto; 94 + int ret = 0; 96 95 97 96 NF_CT_ASSERT(ct); 97 + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) 98 + return 0; 99 + 98 100 99 101 /* we only want to print DIR_ORIGINAL */ 100 102 if (NF_CT_DIRECTION(hash)) 101 - return 0; 103 + goto release; 102 104 if (nf_ct_l3num(ct) != AF_INET) 103 - return 0; 105 + goto release; 104 106 105 107 l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); 106 108 NF_CT_ASSERT(l3proto); 107 109 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 108 110 NF_CT_ASSERT(l4proto); 109 111 112 + ret = -ENOSPC; 110 113 if (seq_printf(s, "%-8s %u %ld ", 111 114 l4proto->name, nf_ct_protonum(ct), 112 115 timer_pending(&ct->timeout) 113 116 ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) 114 - return -ENOSPC; 117 + goto release; 115 118 116 119 if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) 117 - return -ENOSPC; 120 + goto release; 118 121 119 122 if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 120 123 l3proto, l4proto)) 121 - return -ENOSPC; 124 + goto release; 122 125 123 126 if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) 124 - return -ENOSPC; 127 + goto release; 125 128 126 129 if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) 127 130 if (seq_printf(s, "[UNREPLIED] ")) 128 - return -ENOSPC; 131 + goto release; 129 132 130 133 if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 131 134 l3proto, l4proto)) 132 - return -ENOSPC; 135 + goto release; 133 136 134 137 if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) 135 - return -ENOSPC; 138 + goto release; 136 139 137 140 if (test_bit(IPS_ASSURED_BIT, &ct->status)) 138 141 if (seq_printf(s, "[ASSURED] ")) 139 - return -ENOSPC; 142 + goto release; 140 143 141 144 #ifdef CONFIG_NF_CONNTRACK_MARK 142 145 if (seq_printf(s, "mark=%u ", ct->mark)) 143 - return -ENOSPC; 146 + goto release; 144 147 #endif 145 148 146 149 #ifdef CONFIG_NF_CONNTRACK_SECMARK 147 150 if (seq_printf(s, 
"secmark=%u ", ct->secmark)) 148 - return -ENOSPC; 151 + goto release; 149 152 #endif 150 153 151 154 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 152 - return -ENOSPC; 153 - 154 - return 0; 155 + goto release; 156 + ret = 0; 157 + release: 158 + nf_ct_put(ct); 159 + return ret; 155 160 } 156 161 157 162 static const struct seq_operations ct_seq_ops = {
+1 -1
net/ipv4/netfilter/nf_nat_core.c
··· 679 679 static int __net_init nf_nat_net_init(struct net *net) 680 680 { 681 681 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 682 - &net->ipv4.nat_vmalloced); 682 + &net->ipv4.nat_vmalloced, 0); 683 683 if (!net->ipv4.nat_bysource) 684 684 return -ENOMEM; 685 685 return 0;
+70 -53
net/netfilter/nf_conntrack_core.c
··· 29 29 #include <linux/netdevice.h> 30 30 #include <linux/socket.h> 31 31 #include <linux/mm.h> 32 + #include <linux/rculist_nulls.h> 32 33 33 34 #include <net/netfilter/nf_conntrack.h> 34 35 #include <net/netfilter/nf_conntrack_l3proto.h> ··· 164 163 clean_from_lists(struct nf_conn *ct) 165 164 { 166 165 pr_debug("clean_from_lists(%p)\n", ct); 167 - hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); 168 - hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode); 166 + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 167 + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 169 168 170 169 /* Destroy all pending expectations */ 171 170 nf_ct_remove_expectations(ct); ··· 205 204 206 205 /* We overload first tuple to link into unconfirmed list. */ 207 206 if (!nf_ct_is_confirmed(ct)) { 208 - BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode)); 209 - hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); 207 + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); 208 + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 210 209 } 211 210 212 211 NF_CT_STAT_INC(net, delete); ··· 243 242 nf_ct_put(ct); 244 243 } 245 244 245 + /* 246 + * Warning : 247 + * - Caller must take a reference on returned object 248 + * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 249 + * OR 250 + * - Caller must lock nf_conntrack_lock before calling this function 251 + */ 246 252 struct nf_conntrack_tuple_hash * 247 253 __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) 248 254 { 249 255 struct nf_conntrack_tuple_hash *h; 250 - struct hlist_node *n; 256 + struct hlist_nulls_node *n; 251 257 unsigned int hash = hash_conntrack(tuple); 252 258 253 259 /* Disable BHs the entire time since we normally need to disable them 254 260 * at least once for the stats anyway. 
255 261 */ 256 262 local_bh_disable(); 257 - hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { 263 + begin: 264 + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { 258 265 if (nf_ct_tuple_equal(tuple, &h->tuple)) { 259 266 NF_CT_STAT_INC(net, found); 260 267 local_bh_enable(); ··· 270 261 } 271 262 NF_CT_STAT_INC(net, searched); 272 263 } 264 + /* 265 + * if the nulls value we got at the end of this lookup is 266 + * not the expected one, we must restart lookup. 267 + * We probably met an item that was moved to another chain. 268 + */ 269 + if (get_nulls_value(n) != hash) 270 + goto begin; 273 271 local_bh_enable(); 274 272 275 273 return NULL; ··· 291 275 struct nf_conn *ct; 292 276 293 277 rcu_read_lock(); 278 + begin: 294 279 h = __nf_conntrack_find(net, tuple); 295 280 if (h) { 296 281 ct = nf_ct_tuplehash_to_ctrack(h); 297 282 if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) 298 283 h = NULL; 284 + else { 285 + if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) { 286 + nf_ct_put(ct); 287 + goto begin; 288 + } 289 + } 299 290 } 300 291 rcu_read_unlock(); 301 292 ··· 316 293 { 317 294 struct net *net = nf_ct_net(ct); 318 295 319 - hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, 296 + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 320 297 &net->ct.hash[hash]); 321 - hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, 298 + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 322 299 &net->ct.hash[repl_hash]); 323 300 } 324 301 ··· 341 318 struct nf_conntrack_tuple_hash *h; 342 319 struct nf_conn *ct; 343 320 struct nf_conn_help *help; 344 - struct hlist_node *n; 321 + struct hlist_nulls_node *n; 345 322 enum ip_conntrack_info ctinfo; 346 323 struct net *net; 347 324 ··· 373 350 /* See if there's one in the list already, including reverse: 374 351 NAT could have grabbed it without realizing, since we're 375 352 not in the hash. If there is, we lost race. 
*/ 376 - hlist_for_each_entry(h, n, &net->ct.hash[hash], hnode) 353 + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 377 354 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 378 355 &h->tuple)) 379 356 goto out; 380 - hlist_for_each_entry(h, n, &net->ct.hash[repl_hash], hnode) 357 + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) 381 358 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 382 359 &h->tuple)) 383 360 goto out; 384 361 385 362 /* Remove from unconfirmed list */ 386 - hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); 363 + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 387 364 388 365 __nf_conntrack_hash_insert(ct, hash, repl_hash); 389 366 /* Timer relative to confirmation time, not original ··· 422 399 { 423 400 struct net *net = nf_ct_net(ignored_conntrack); 424 401 struct nf_conntrack_tuple_hash *h; 425 - struct hlist_node *n; 402 + struct hlist_nulls_node *n; 426 403 unsigned int hash = hash_conntrack(tuple); 427 404 428 405 /* Disable BHs the entire time since we need to disable them at 429 406 * least once for the stats anyway. 
430 407 */ 431 408 rcu_read_lock_bh(); 432 - hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { 409 + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { 433 410 if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && 434 411 nf_ct_tuple_equal(tuple, &h->tuple)) { 435 412 NF_CT_STAT_INC(net, found); ··· 453 430 /* Use oldest entry, which is roughly LRU */ 454 431 struct nf_conntrack_tuple_hash *h; 455 432 struct nf_conn *ct = NULL, *tmp; 456 - struct hlist_node *n; 433 + struct hlist_nulls_node *n; 457 434 unsigned int i, cnt = 0; 458 435 int dropped = 0; 459 436 460 437 rcu_read_lock(); 461 438 for (i = 0; i < nf_conntrack_htable_size; i++) { 462 - hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], 463 - hnode) { 439 + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], 440 + hnnode) { 464 441 tmp = nf_ct_tuplehash_to_ctrack(h); 465 442 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) 466 443 ct = tmp; ··· 531 508 #ifdef CONFIG_NET_NS 532 509 ct->ct_net = net; 533 510 #endif 534 - INIT_RCU_HEAD(&ct->rcu); 535 511 536 512 return ct; 537 513 } 538 514 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 539 - 540 - static void nf_conntrack_free_rcu(struct rcu_head *head) 541 - { 542 - struct nf_conn *ct = container_of(head, struct nf_conn, rcu); 543 - 544 - nf_ct_ext_free(ct); 545 - kmem_cache_free(nf_conntrack_cachep, ct); 546 - } 547 515 548 516 void nf_conntrack_free(struct nf_conn *ct) 549 517 { ··· 542 528 543 529 nf_ct_ext_destroy(ct); 544 530 atomic_dec(&net->ct.count); 545 - call_rcu(&ct->rcu, nf_conntrack_free_rcu); 531 + nf_ct_ext_free(ct); 532 + kmem_cache_free(nf_conntrack_cachep, ct); 546 533 } 547 534 EXPORT_SYMBOL_GPL(nf_conntrack_free); 548 535 ··· 609 594 } 610 595 611 596 /* Overload tuple linked list to put us in unconfirmed list. 
*/ 612 - hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, 597 + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 613 598 &net->ct.unconfirmed); 614 599 615 600 spin_unlock_bh(&nf_conntrack_lock); ··· 949 934 { 950 935 struct nf_conntrack_tuple_hash *h; 951 936 struct nf_conn *ct; 952 - struct hlist_node *n; 937 + struct hlist_nulls_node *n; 953 938 954 939 spin_lock_bh(&nf_conntrack_lock); 955 940 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 956 - hlist_for_each_entry(h, n, &net->ct.hash[*bucket], hnode) { 941 + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { 957 942 ct = nf_ct_tuplehash_to_ctrack(h); 958 943 if (iter(ct, data)) 959 944 goto found; 960 945 } 961 946 } 962 - hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) { 947 + hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { 963 948 ct = nf_ct_tuplehash_to_ctrack(h); 964 949 if (iter(ct, data)) 965 950 set_bit(IPS_DYING_BIT, &ct->status); ··· 1007 992 return 1; 1008 993 } 1009 994 1010 - void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int size) 995 + void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) 1011 996 { 1012 997 if (vmalloced) 1013 998 vfree(hash); ··· 1075 1060 } 1076 1061 } 1077 1062 1078 - struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) 1063 + void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) 1079 1064 { 1080 - struct hlist_head *hash; 1081 - unsigned int size, i; 1065 + struct hlist_nulls_head *hash; 1066 + unsigned int nr_slots, i; 1067 + size_t sz; 1082 1068 1083 1069 *vmalloced = 0; 1084 1070 1085 - size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head)); 1086 - hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN, 1087 - get_order(sizeof(struct hlist_head) 1088 - * size)); 1071 + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 1072 + nr_slots = *sizep = roundup(*sizep, 
PAGE_SIZE / sizeof(struct hlist_nulls_head)); 1073 + sz = nr_slots * sizeof(struct hlist_nulls_head); 1074 + hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, 1075 + get_order(sz)); 1089 1076 if (!hash) { 1090 1077 *vmalloced = 1; 1091 1078 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); 1092 - hash = vmalloc(sizeof(struct hlist_head) * size); 1079 + hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 1093 1080 } 1094 1081 1095 - if (hash) 1096 - for (i = 0; i < size; i++) 1097 - INIT_HLIST_HEAD(&hash[i]); 1082 + if (hash && nulls) 1083 + for (i = 0; i < nr_slots; i++) 1084 + INIT_HLIST_NULLS_HEAD(&hash[i], i); 1098 1085 1099 1086 return hash; 1100 1087 } ··· 1107 1090 int i, bucket, vmalloced, old_vmalloced; 1108 1091 unsigned int hashsize, old_size; 1109 1092 int rnd; 1110 - struct hlist_head *hash, *old_hash; 1093 + struct hlist_nulls_head *hash, *old_hash; 1111 1094 struct nf_conntrack_tuple_hash *h; 1112 1095 1113 1096 /* On boot, we can set this without any fancy locking. 
*/ ··· 1118 1101 if (!hashsize) 1119 1102 return -EINVAL; 1120 1103 1121 - hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced); 1104 + hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1); 1122 1105 if (!hash) 1123 1106 return -ENOMEM; 1124 1107 ··· 1133 1116 */ 1134 1117 spin_lock_bh(&nf_conntrack_lock); 1135 1118 for (i = 0; i < nf_conntrack_htable_size; i++) { 1136 - while (!hlist_empty(&init_net.ct.hash[i])) { 1137 - h = hlist_entry(init_net.ct.hash[i].first, 1138 - struct nf_conntrack_tuple_hash, hnode); 1139 - hlist_del_rcu(&h->hnode); 1119 + while (!hlist_nulls_empty(&init_net.ct.hash[i])) { 1120 + h = hlist_nulls_entry(init_net.ct.hash[i].first, 1121 + struct nf_conntrack_tuple_hash, hnnode); 1122 + hlist_nulls_del_rcu(&h->hnnode); 1140 1123 bucket = __hash_conntrack(&h->tuple, hashsize, rnd); 1141 - hlist_add_head_rcu(&h->hnode, &hash[bucket]); 1124 + hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 1142 1125 } 1143 1126 } 1144 1127 old_size = nf_conntrack_htable_size; ··· 1189 1172 1190 1173 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 1191 1174 sizeof(struct nf_conn), 1192 - 0, 0, NULL); 1175 + 0, SLAB_DESTROY_BY_RCU, NULL); 1193 1176 if (!nf_conntrack_cachep) { 1194 1177 printk(KERN_ERR "Unable to create nf_conn slab cache\n"); 1195 1178 ret = -ENOMEM; ··· 1219 1202 int ret; 1220 1203 1221 1204 atomic_set(&net->ct.count, 0); 1222 - INIT_HLIST_HEAD(&net->ct.unconfirmed); 1205 + INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0); 1223 1206 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 1224 1207 if (!net->ct.stat) { 1225 1208 ret = -ENOMEM; ··· 1229 1212 if (ret < 0) 1230 1213 goto err_ecache; 1231 1214 net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1232 - &net->ct.hash_vmalloc); 1215 + &net->ct.hash_vmalloc, 1); 1233 1216 if (!net->ct.hash) { 1234 1217 ret = -ENOMEM; 1235 1218 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+1 -1
net/netfilter/nf_conntrack_expect.c
··· 604 604 605 605 net->ct.expect_count = 0; 606 606 net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 607 - &net->ct.expect_vmalloc); 607 + &net->ct.expect_vmalloc, 0); 608 608 if (net->ct.expect_hash == NULL) 609 609 goto err1; 610 610
+4 -3
net/netfilter/nf_conntrack_helper.c
··· 159 159 struct nf_conntrack_tuple_hash *h; 160 160 struct nf_conntrack_expect *exp; 161 161 const struct hlist_node *n, *next; 162 + const struct hlist_nulls_node *nn; 162 163 unsigned int i; 163 164 164 165 /* Get rid of expectations */ ··· 176 175 } 177 176 178 177 /* Get rid of expecteds, set helpers to NULL. */ 179 - hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) 178 + hlist_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) 180 179 unhelp(h, me); 181 180 for (i = 0; i < nf_conntrack_htable_size; i++) { 182 - hlist_for_each_entry(h, n, &net->ct.hash[i], hnode) 181 + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) 183 182 unhelp(h, me); 184 183 } 185 184 } ··· 219 218 220 219 nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ 221 220 nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 222 - &nf_ct_helper_vmalloc); 221 + &nf_ct_helper_vmalloc, 0); 223 222 if (!nf_ct_helper_hash) 224 223 return -ENOMEM; 225 224
+11 -9
net/netfilter/nf_conntrack_netlink.c
··· 19 19 #include <linux/module.h> 20 20 #include <linux/kernel.h> 21 21 #include <linux/rculist.h> 22 + #include <linux/rculist_nulls.h> 22 23 #include <linux/types.h> 23 24 #include <linux/timer.h> 24 25 #include <linux/skbuff.h> ··· 537 536 { 538 537 struct nf_conn *ct, *last; 539 538 struct nf_conntrack_tuple_hash *h; 540 - struct hlist_node *n; 539 + struct hlist_nulls_node *n; 541 540 struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); 542 541 u_int8_t l3proto = nfmsg->nfgen_family; 543 542 ··· 545 544 last = (struct nf_conn *)cb->args[1]; 546 545 for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { 547 546 restart: 548 - hlist_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]], 549 - hnode) { 547 + hlist_nulls_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]], 548 + hnnode) { 550 549 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 551 550 continue; 552 551 ct = nf_ct_tuplehash_to_ctrack(h); 552 + if (!atomic_inc_not_zero(&ct->ct_general.use)) 553 + continue; 553 554 /* Dump entries of a given L3 protocol number. 554 555 * If it is not specified, ie. l3proto == 0, 555 556 * then dump everything. 
*/ 556 557 if (l3proto && nf_ct_l3num(ct) != l3proto) 557 - continue; 558 + goto releasect; 558 559 if (cb->args[1]) { 559 560 if (ct != last) 560 - continue; 561 + goto releasect; 561 562 cb->args[1] = 0; 562 563 } 563 564 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, 564 565 cb->nlh->nlmsg_seq, 565 566 IPCTNL_MSG_CT_NEW, 566 567 1, ct) < 0) { 567 - if (!atomic_inc_not_zero(&ct->ct_general.use)) 568 - continue; 569 568 cb->args[1] = (unsigned long)ct; 570 569 goto out; 571 570 } ··· 578 577 if (acct) 579 578 memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX])); 580 579 } 580 + releasect: 581 + nf_ct_put(ct); 581 582 } 582 583 if (cb->args[1]) { 583 584 cb->args[1] = 0; ··· 1245 1242 if (err < 0) 1246 1243 goto err2; 1247 1244 1248 - master_h = __nf_conntrack_find(&init_net, &master); 1245 + master_h = nf_conntrack_find_get(&init_net, &master); 1249 1246 if (master_h == NULL) { 1250 1247 err = -ENOENT; 1251 1248 goto err2; 1252 1249 } 1253 1250 master_ct = nf_ct_tuplehash_to_ctrack(master_h); 1254 - nf_conntrack_get(&master_ct->ct_general); 1255 1251 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1256 1252 ct->master = master_ct; 1257 1253 }
+33 -24
net/netfilter/nf_conntrack_standalone.c
··· 44 44 unsigned int bucket; 45 45 }; 46 46 47 - static struct hlist_node *ct_get_first(struct seq_file *seq) 47 + static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) 48 48 { 49 49 struct net *net = seq_file_net(seq); 50 50 struct ct_iter_state *st = seq->private; 51 - struct hlist_node *n; 51 + struct hlist_nulls_node *n; 52 52 53 53 for (st->bucket = 0; 54 54 st->bucket < nf_conntrack_htable_size; 55 55 st->bucket++) { 56 56 n = rcu_dereference(net->ct.hash[st->bucket].first); 57 - if (n) 57 + if (!is_a_nulls(n)) 58 58 return n; 59 59 } 60 60 return NULL; 61 61 } 62 62 63 - static struct hlist_node *ct_get_next(struct seq_file *seq, 64 - struct hlist_node *head) 63 + static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, 64 + struct hlist_nulls_node *head) 65 65 { 66 66 struct net *net = seq_file_net(seq); 67 67 struct ct_iter_state *st = seq->private; 68 68 69 69 head = rcu_dereference(head->next); 70 - while (head == NULL) { 71 - if (++st->bucket >= nf_conntrack_htable_size) 72 - return NULL; 70 + while (is_a_nulls(head)) { 71 + if (likely(get_nulls_value(head) == st->bucket)) { 72 + if (++st->bucket >= nf_conntrack_htable_size) 73 + return NULL; 74 + } 73 75 head = rcu_dereference(net->ct.hash[st->bucket].first); 74 76 } 75 77 return head; 76 78 } 77 79 78 - static struct hlist_node *ct_get_idx(struct seq_file *seq, loff_t pos) 80 + static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) 79 81 { 80 - struct hlist_node *head = ct_get_first(seq); 82 + struct hlist_nulls_node *head = ct_get_first(seq); 81 83 82 84 if (head) 83 85 while (pos && (head = ct_get_next(seq, head))) ··· 109 107 /* return 0 on success, 1 in case of error */ 110 108 static int ct_seq_show(struct seq_file *s, void *v) 111 109 { 112 - const struct nf_conntrack_tuple_hash *hash = v; 113 - const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); 110 + struct nf_conntrack_tuple_hash *hash = v; 111 + struct nf_conn *ct = 
nf_ct_tuplehash_to_ctrack(hash); 114 112 const struct nf_conntrack_l3proto *l3proto; 115 113 const struct nf_conntrack_l4proto *l4proto; 114 + int ret = 0; 116 115 117 116 NF_CT_ASSERT(ct); 117 + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) 118 + return 0; 118 119 119 120 /* we only want to print DIR_ORIGINAL */ 120 121 if (NF_CT_DIRECTION(hash)) 121 - return 0; 122 + goto release; 122 123 123 124 l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); 124 125 NF_CT_ASSERT(l3proto); 125 126 l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); 126 127 NF_CT_ASSERT(l4proto); 127 128 129 + ret = -ENOSPC; 128 130 if (seq_printf(s, "%-8s %u %-8s %u %ld ", 129 131 l3proto->name, nf_ct_l3num(ct), 130 132 l4proto->name, nf_ct_protonum(ct), 131 133 timer_pending(&ct->timeout) 132 134 ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) 133 - return -ENOSPC; 135 + goto release; 134 136 135 137 if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) 136 - return -ENOSPC; 138 + goto release; 137 139 138 140 if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 139 141 l3proto, l4proto)) 140 - return -ENOSPC; 142 + goto release; 141 143 142 144 if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) 143 - return -ENOSPC; 145 + goto release; 144 146 145 147 if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) 146 148 if (seq_printf(s, "[UNREPLIED] ")) 147 - return -ENOSPC; 149 + goto release; 148 150 149 151 if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 150 152 l3proto, l4proto)) 151 - return -ENOSPC; 153 + goto release; 152 154 153 155 if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) 154 - return -ENOSPC; 156 + goto release; 155 157 156 158 if (test_bit(IPS_ASSURED_BIT, &ct->status)) 157 159 if (seq_printf(s, "[ASSURED] ")) 158 - return -ENOSPC; 160 + goto release; 159 161 160 162 #if defined(CONFIG_NF_CONNTRACK_MARK) 161 163 if (seq_printf(s, "mark=%u ", ct->mark)) 162 - return -ENOSPC; 164 + goto release; 163 165 #endif 164 166 165 167 #ifdef 
CONFIG_NF_CONNTRACK_SECMARK 166 168 if (seq_printf(s, "secmark=%u ", ct->secmark)) 167 - return -ENOSPC; 169 + goto release; 168 170 #endif 169 171 170 172 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 171 - return -ENOSPC; 173 + goto release; 172 174 175 + ret = 0; 176 + release: 177 + nf_ct_put(ct); 173 178 return 0; 174 179 } 175 180
+4 -2
net/netfilter/xt_connlimit.c
··· 108 108 const struct nf_conntrack_tuple_hash *found; 109 109 struct xt_connlimit_conn *conn; 110 110 struct xt_connlimit_conn *tmp; 111 - const struct nf_conn *found_ct; 111 + struct nf_conn *found_ct; 112 112 struct list_head *hash; 113 113 bool addit = true; 114 114 int matches = 0; ··· 123 123 124 124 /* check the saved connections */ 125 125 list_for_each_entry_safe(conn, tmp, hash, list) { 126 - found = __nf_conntrack_find(&init_net, &conn->tuple); 126 + found = nf_conntrack_find_get(&init_net, &conn->tuple); 127 127 found_ct = NULL; 128 128 129 129 if (found != NULL) ··· 151 151 * we do not care about connections which are 152 152 * closed already -> ditch it 153 153 */ 154 + nf_ct_put(found_ct); 154 155 list_del(&conn->list); 155 156 kfree(conn); 156 157 continue; ··· 161 160 match->family)) 162 161 /* same source network -> be counted! */ 163 162 ++matches; 163 + nf_ct_put(found_ct); 164 164 } 165 165 166 166 rcu_read_unlock();